// 构造一个tri-trainer分类器。 public Tritrainer( String classifier, String trainingIns_File, String testIns_File, double precentage) { try { this.classifier1 = (Classifier) Class.forName(classifier).newInstance(); this.classifier2 = (Classifier) Class.forName(classifier).newInstance(); this.classifier3 = (Classifier) Class.forName(classifier).newInstance(); Instances trainingInstances = Util.getInstances(trainingIns_File); // 将trainIns_File按照precentage和(1-precentage)的比例切割成labeledIns和unlabeledIns; int length = trainingInstances.numInstances(); int i = new Double(length * precentage).intValue(); labeledIns = new Instances(trainingInstances, 0); for (int j = 0; j < i; j++) { labeledIns.add(trainingInstances.firstInstance()); trainingInstances.delete(0); } unlabeledIns = trainingInstances; testIns = Util.getInstances(testIns_File); Init(); } catch (Exception e) { } }
// 将样本集中裁剪提取成m个样本组成的集合; public void SubSample(Instances inst, int m) { inst.randomize(new Random()); while (inst.numInstances() != m) { inst.delete(0); } // System.out.println("subsample:=" + inst.numInstances() + " m:=" + m ); }
/** * This will remove all buffered instances from the inputformat dataset. Use this method rather * than getInputFormat().delete(); */ protected void flushInput() { if ((m_InputStringAtts.getAttributeIndices().length > 0) || (m_InputRelAtts.getAttributeIndices().length > 0)) { m_InputFormat = m_InputFormat.stringFreeStructure(); m_InputStringAtts = new StringLocator(m_InputFormat, m_InputStringAtts.getAllowedIndices()); m_InputRelAtts = new RelationalLocator(m_InputFormat, m_InputRelAtts.getAllowedIndices()); } else { // This more efficient than new Instances(m_InputFormat, 0); m_InputFormat.delete(); } }
/** * Process the input and compute the statistics of the training data * * @return an Input object which holds the statistics about the training data */ private Input sIB_ProcessInput() { double valSum = 0.0; for (int i = 0; i < m_numInstances; i++) { valSum = 0.0; for (int v = 0; v < m_data.instance(i).numValues(); v++) { valSum += m_data.instance(i).valueSparse(v); } if (valSum <= 0) { if (m_verbose) { System.out.format("Instance %s sum of value = %s <= 0, removed.\n", i, valSum); } m_data.delete(i); m_numInstances--; } } // get the term-document matrix Input input = new Input(); input.Py_x = getTransposedNormedMatrix(m_data); if (m_uniformPrior) { input.Pyx = input.Py_x.copy(); normalizePrior(m_data); } else { input.Pyx = getTransposedMatrix(m_data); } input.sumVals = getTotalSum(m_data); input.Pyx.timesEquals(1 / input.sumVals); // prior probability of documents, ie. sum the columns from the Pyx matrix input.Px = new double[m_numInstances]; for (int i = 0; i < m_numInstances; i++) { for (int j = 0; j < m_numAttributes; j++) { input.Px[i] += input.Pyx.get(j, i); } } // prior probability of terms, ie. sum the rows from the Pyx matrix input.Py = new double[m_numAttributes]; for (int i = 0; i < input.Pyx.getRowDimension(); i++) { for (int j = 0; j < input.Pyx.getColumnDimension(); j++) { input.Py[i] += input.Pyx.get(i, j); } } MI(input.Pyx, input); return input; }
/**
 * Read the data set incrementally---get the next instance in the data set or returns null if
 * there are no more instances to get. If the structure hasn't yet been determined by a call to
 * getStructure then method should do so before returning the next instance in the data set.
 *
 * @return the next instance in the data set as an Instance object or null if there are no more
 *     instances to be read
 * @exception IOException if there is an error during parsing
 */
public Instance getNextInstance() throws IOException {
  if (m_structure == null) {
    getStructure();
  }
  if (getRetrieval() == BATCH) {
    throw new IOException("Cannot mix getting Instances in both incremental and batch modes");
  }
  setRetrieval(INCREMENTAL);
  // Try to pull one more row from the source; signal exhaustion with null.
  boolean gotOne = m_structure.readInstance(m_sourceReader);
  if (!gotOne) {
    return null;
  }
  // The freshly read row sits at slot 0 of the structure; detach and return it.
  Instance next = m_structure.instance(0);
  m_structure.delete(0);
  return next;
}
/** * Adds the supplied instance to the training set. * * @param instance the instance to add * @throws Exception if instance could not be incorporated successfully */ public void updateClassifier(Instance instance) throws Exception { if (m_Train.equalHeaders(instance.dataset()) == false) { throw new Exception("Incompatible instance types"); } if (instance.classIsMissing()) { return; } m_Train.add(instance); m_NNSearch.update(instance); m_kNNValid = false; if ((m_WindowSize > 0) && (m_Train.numInstances() > m_WindowSize)) { boolean deletedInstance = false; while (m_Train.numInstances() > m_WindowSize) { m_Train.delete(0); deletedInstance = true; } // rebuild datastructure KDTree currently can't delete if (deletedInstance == true) m_NNSearch.setInstances(m_Train); } }
/**
 * Process an instance. When the batch size is met then a batch of predictions is returned;
 * returns null if the batch size has not been matched yet
 *
 * @param inst the instance to process
 * @return a batch of predictions or null if we have not seen enough input instances to meet the
 *     batch size yet
 * @throws DistributedWekaException if a problem occurs
 */
public double[][] processInstanceBatchPredictor(Instance inst) throws DistributedWekaException {
  inst = mapIncomingFieldsToModelFields(inst);
  m_batchScoringData.add(inst);
  // Keep buffering until a full batch has accumulated.
  if (m_batchScoringData.numInstances() != m_batchSize) {
    return null;
  }
  try {
    double[][] predictions = m_model.distributionsForInstances(m_batchScoringData);
    if (predictions.length != m_batchScoringData.numInstances()) {
      throw new Exception(
          "Number of predictions did not match the number " + "of instances in the batch");
    }
    // Reset the buffer so the next batch starts empty.
    m_batchScoringData.delete();
    return predictions;
  } catch (Exception ex) {
    throw new DistributedWekaException(ex);
  }
}
public static void execSVM(String expName) { try { FileWriter outFile = null; PrintWriter out = null; outFile = new FileWriter(expName + "-SVM.results"); out = new PrintWriter(outFile); DateFormat dateFormat = new SimpleDateFormat("yyyy/MM/dd HH:mm:ss"); ProcessTweets tweetsProcessor = null; System.out.println("***************************************"); System.out.println("***\tEXECUTING TEST\t" + expName + "***"); System.out.println("+++++++++++++++++++++++++++++++++++++++"); out.println("***************************************"); out.println("***\tEXECUTING TEST\t" + expName + "***"); out.println("+++++++++++++++++++++++++++++++++++++++"); out.println("4-Generate classifier " + dateFormat.format(new Date())); Classifier cls = null; DataSource sourceTrain = new DataSource(expName + "-train.arff"); Instances dataTrain = sourceTrain.getDataSet(); if (dataTrain.classIndex() == -1) dataTrain.setClassIndex(dataTrain.numAttributes() - 1); // Entreno el clasificador // cls = new weka.classifiers.functions.LibSVM(); int clase = dataTrain.numAttributes() - 1; cls = new weka.classifiers.bayes.ComplementNaiveBayes(); dataTrain.setClassIndex(clase); cls.buildClassifier(dataTrain); ObjectOutputStream oos = new ObjectOutputStream(new FileOutputStream(expName + "-SVM.classifier")); oos.writeObject(cls); oos.flush(); oos.close(); DataSource sourceTest = new DataSource(expName + "-test.arff"); Instances dataTest = sourceTest.getDataSet(); dataTest.setClassIndex(clase); Evaluation eval = new Evaluation(dataTest); eval.evaluateModel(cls, dataTest); // Ahora calculo los valores precision, recall y fmeasure. 
Además saco las matrices de // confusion float precision = 0; float recall = 0; float fmeasure = 0; int topeTopics = 8; for (int ind = 0; ind < topeTopics; ind++) { precision += eval.precision(ind); recall += eval.recall(ind); fmeasure += eval.fMeasure(ind); } precision = precision / topeTopics; recall = recall / topeTopics; fmeasure = fmeasure / topeTopics; System.out.println("++++++++++++++ CNB ++++++++++++++++++++"); System.out.println(eval.toMatrixString()); System.out.println("+++++++++++++++++++++++++++++++++++++++"); System.out.printf("Precision: %.3f\n", precision); System.out.printf("Recall: %.3f\n", recall); System.out.printf("F-measure: %.3f\n", fmeasure); System.out.println("***************************************"); out.println("++++++++++++++ CNB ++++++++++++++++++++"); out.println(eval.toMatrixString()); out.println("+++++++++++++++++++++++++++++++++++++++"); out.printf("Precision: %.3f\n", precision); out.printf("Recall: %.3f\n", recall); out.printf("F-measure: %.3f\n", fmeasure); out.println("***************************************"); // OTRO CLASIFICADOR ZeroR cls = new weka.classifiers.rules.ZeroR(); dataTrain.setClassIndex(clase); cls.buildClassifier(dataTrain); eval = new Evaluation(dataTest); eval.evaluateModel(cls, dataTest); precision = 0; recall = 0; fmeasure = 0; for (int ind = 0; ind < topeTopics; ind++) { precision += eval.precision(ind); recall += eval.recall(ind); fmeasure += eval.fMeasure(ind); } precision = precision / topeTopics; recall = recall / topeTopics; fmeasure = fmeasure / topeTopics; System.out.println("++++++++++++++ ZEROR ++++++++++++++++++++"); System.out.println(eval.toMatrixString()); System.out.println("+++++++++++++++++++++++++++++++++++++++"); System.out.printf("Precision: %.3f\n", precision); System.out.printf("Recall: %.3f\n", recall); System.out.printf("F-measure: %.3f\n", fmeasure); System.out.println("***************************************"); out.println("++++++++++++++ ZEROR ++++++++++++++++++++"); 
out.println(eval.toMatrixString()); out.println("+++++++++++++++++++++++++++++++++++++++"); out.printf("Precision: %.3f\n", precision); out.printf("Recall: %.3f\n", recall); out.printf("F-measure: %.3f\n", fmeasure); out.println("***************************************"); // OTRO CLASIFICADOR J48 /* cls = new weka.classifiers.trees.J48(); dataTrain.setClassIndex(clase); cls.buildClassifier(dataTrain); eval = new Evaluation(dataTest); eval.evaluateModel(cls, dataTest); precision=0; recall=0; fmeasure=0; for(int ind=0; ind<topeTopics; ind++) { precision += eval.precision(ind); recall += eval.recall(ind); fmeasure += eval.fMeasure(ind); } precision = precision / topeTopics; recall = recall / topeTopics; fmeasure = fmeasure / topeTopics; System.out.println("++++++++++++++ J48 ++++++++++++++++++++"); System.out.println(eval.toMatrixString()); System.out.println("+++++++++++++++++++++++++++++++++++++++"); System.out.printf("Precision: %.3f\n", precision); System.out.printf("Recall: %.3f\n", recall); System.out.printf("F-measure: %.3f\n", fmeasure); System.out.println("***************************************"); out.println("++++++++++++++ J48 ++++++++++++++++++++"); out.println(eval.toMatrixString()); out.println("+++++++++++++++++++++++++++++++++++++++"); out.printf("Precision: %.3f\n", precision); out.printf("Recall: %.3f\n", recall); out.printf("F-measure: %.3f\n", fmeasure); out.println("***************************************"); //OTRO SMO cls = new weka.classifiers.functions.SMO(); dataTrain.setClassIndex(clase); cls.buildClassifier(dataTrain); eval = new Evaluation(dataTest); eval.evaluateModel(cls, dataTest); precision=0; recall=0; fmeasure=0; for(int ind=0; ind<topeTopics; ind++) { precision += eval.precision(ind); recall += eval.recall(ind); fmeasure += eval.fMeasure(ind); } precision = precision / topeTopics; recall = recall / topeTopics; fmeasure = fmeasure / topeTopics; System.out.println("++++++++++++++ SMO ++++++++++++++++++++"); 
System.out.println(eval.toMatrixString()); System.out.println("+++++++++++++++++++++++++++++++++++++++"); System.out.printf("Precision: %.3f\n", precision); System.out.printf("Recall: %.3f\n", recall); System.out.printf("F-measure: %.3f\n", fmeasure); System.out.println("***************************************"); out.println("++++++++++++++ SMO ++++++++++++++++++++"); out.println(eval.toMatrixString()); out.println("+++++++++++++++++++++++++++++++++++++++"); out.printf("Precision: %.3f\n", precision); out.printf("Recall: %.3f\n", recall); out.printf("F-measure: %.3f\n", fmeasure); out.println("***************************************"); */ out.flush(); out.close(); dataTest.delete(); dataTrain.delete(); } catch (FileNotFoundException e) { // TODO Auto-generated catch block e.printStackTrace(); } catch (IOException e) { // TODO Auto-generated catch block e.printStackTrace(); } catch (Exception e) { // TODO Auto-generated catch block e.printStackTrace(); } }
public void exec(PrintWriter printer) { try { FileWriter outFile = null; PrintWriter out = null; if (printer == null) { outFile = new FileWriter(id + ".results"); out = new PrintWriter(outFile); } else out = printer; DateFormat dateFormat = new SimpleDateFormat("yyyy/MM/dd HH:mm:ss"); ProcessTweets tweetsProcessor = null; System.out.println("***************************************"); System.out.println("***\tEXECUTING TEST\t" + id + "***"); System.out.println("+++++++++++++++++++++++++++++++++++++++"); System.out.println("Train size:" + traincorpus.size()); System.out.println("Test size:" + testcorpus.size()); out.println("***************************************"); out.println("***\tEXECUTING TEST\t***"); out.println("+++++++++++++++++++++++++++++++++++++++"); out.println("Train size:" + traincorpus.size()); out.println("Test size:" + testcorpus.size()); String cloneID = ""; boolean clonar = false; if (baseline) { System.out.println("***************************************"); System.out.println("***\tEXECUTING TEST BASELINE\t***"); System.out.println("+++++++++++++++++++++++++++++++++++++++"); System.out.println("Train size:" + traincorpus.size()); System.out.println("Test size:" + testcorpus.size()); out.println("***************************************"); out.println("***\tEXECUTING TEST\t***"); out.println("+++++++++++++++++++++++++++++++++++++++"); out.println("Train size:" + traincorpus.size()); out.println("Test size:" + testcorpus.size()); BaselineClassifier base = new BaselineClassifier(testcorpus, 8); precision = base.getPrecision(); recall = base.getRecall(); fmeasure = base.getFmeasure(); System.out.println("+++++++++++++++++++++++++++++++++++++++"); System.out.printf("Precision: %.3f\n", precision); System.out.printf("Recall: %.3f\n", recall); System.out.printf("F-measure: %.3f\n", fmeasure); System.out.println("***************************************"); out.println("+++++++++++++++++++++++++++++++++++++++"); out.printf("Precision: %.3f\n", precision); 
out.printf("Recall: %.3f\n", recall); out.printf("F-measure: %.3f\n", fmeasure); out.println("***************************************"); out.flush(); out.close(); return; } else { System.out.println("Stemming: " + stemming); System.out.println("Lematization:" + lematization); System.out.println("URLs:" + urls); System.out.println("Hashtags:" + hashtags); System.out.println("Mentions:" + mentions); System.out.println("Unigrams:" + unigrams); System.out.println("Bigrams:" + bigrams); System.out.println("TF:" + tf); System.out.println("TF-IDF:" + tfidf); out.println("Stemming: " + stemming); out.println("Lematization:" + lematization); out.println("URLs:" + urls); out.println("Hashtags:" + hashtags); out.println("Mentions:" + mentions); out.println("Unigrams:" + unigrams); out.println("Bigrams:" + bigrams); out.println("TF:" + tf); out.println("TF-IDF:" + tfidf); } // Si tengo los tweets procesados, me evito un nuevo proceso System.out.println("1-Process tweets " + dateFormat.format(new Date())); out.println("1-Process tweets " + dateFormat.format(new Date())); List<ProcessedTweet> train = null; String[] ids = id.split("-"); cloneID = ids[0] + "-" + (Integer.valueOf(ids[1]) + 6); if (((Integer.valueOf(ids[1]) / 6) % 2) == 0) clonar = true; if (new File(id + "-train.ptweets").exists()) { train = ProcessedTweetSerialization.fromFile(id + "-train.ptweets"); tweetsProcessor = new ProcessTweets(stemming, lematization, urls, hashtags, mentions, unigrams, bigrams); if (lematization) { tweetsProcessor.doLematization(train); } if (stemming) { tweetsProcessor.doStemming(train); } } else { tweetsProcessor = new ProcessTweets(stemming, lematization, urls, hashtags, mentions, unigrams, bigrams); // Esto del set training es un añadido para poder diferenciar los idiomas de las url en el // corpus paralelo // tweetsProcessor.setTraining(true); train = tweetsProcessor.processTweets(traincorpus); // tweetsProcessor.setTraining(false); ProcessedTweetSerialization.toFile(id + 
"-train.ptweets", train); /* if (clonar) { File f = new File (id+"-train.ptweets"); Path p = f.toPath(); CopyOption[] options = new CopyOption[]{ StandardCopyOption.REPLACE_EXISTING, StandardCopyOption.COPY_ATTRIBUTES }; Files.copy(p, new File (cloneID+"-train.ptweets").toPath(), options); Files.copy(p, new File (ids[0]+"-"+(Integer.valueOf(ids[1])+12)+"-train.ptweets").toPath(), options); Files.copy(p, new File (ids[0]+"-"+(Integer.valueOf(ids[1])+18)+"-train.ptweets").toPath(), options); Files.copy(p, new File (ids[0]+"-"+(Integer.valueOf(ids[1])+24)+"-train.ptweets").toPath(), options); Files.copy(p, new File (ids[0]+"-"+(Integer.valueOf(ids[1])+30)+"-train.ptweets").toPath(), options); } */ } // Generamos las BOW. Igual que antes, si existen no las creo. System.out.println("2-Fill topics " + dateFormat.format(new Date())); out.println("2-Fill topics " + dateFormat.format(new Date())); TopicsList topics = null; if (new File(id + ".topics").exists()) { topics = TopicsSerialization.fromFile(id + ".topics"); if (tf) topics.setSelectionFeature(TopicDesc.TERM_TF); else topics.setSelectionFeature(TopicDesc.TERM_TF_IDF); topics.prepareTopics(); } else { topics = new TopicsList(); if (tf) topics.setSelectionFeature(TopicDesc.TERM_TF); else topics.setSelectionFeature(TopicDesc.TERM_TF_IDF); System.out.println("Filling topics " + dateFormat.format(new Date())); topics.fillTopics(train); System.out.println("Preparing topics topics " + dateFormat.format(new Date())); // Aquí tengo que serializar antes de preparar, porque si no no puedo calcular los tf y // tfidf System.out.println("Serializing topics topics " + dateFormat.format(new Date())); /* if (clonar) { TopicsSerialization.toFile(cloneID+".topics", topics); } */ topics.prepareTopics(); TopicsSerialization.toFile(id + ".topics", topics); } System.out.println("3-Generate arff train file " + dateFormat.format(new Date())); out.println("3-Generate arff train file " + dateFormat.format(new Date())); // Si el fichero arff 
no existe, lo creo. en caso contrario vengo haciendo lo que hasta ahora, // aprovechar trabajo previo if (!new File(id + "-train.arff").exists()) { BufferedWriter bw = topics.generateArffHeader(id + "-train.arff"); int tope = traincorpus.size(); if (tweetsProcessor == null) tweetsProcessor = new ProcessTweets( stemming, lematization, urls, hashtags, mentions, unigrams, bigrams); for (int indTweet = 0; indTweet < tope; indTweet++) { topics.generateArffVector(bw, train.get(indTweet)); } bw.flush(); bw.close(); } // Ahora proceso los datos de test System.out.println("5-build test dataset " + dateFormat.format(new Date())); out.println("5-build test dataset " + dateFormat.format(new Date())); List<ProcessedTweet> test = null; if (new File(id + "-test.ptweets").exists()) test = ProcessedTweetSerialization.fromFile(id + "-test.ptweets"); else { if (tweetsProcessor == null) tweetsProcessor = new ProcessTweets( stemming, lematization, urls, hashtags, mentions, unigrams, bigrams); test = tweetsProcessor.processTweets(testcorpus); ProcessedTweetSerialization.toFile(id + "-test.ptweets", test); /* if (clonar) { File f = new File (id+"-test.ptweets"); Path p = f.toPath(); CopyOption[] options = new CopyOption[]{ StandardCopyOption.REPLACE_EXISTING, StandardCopyOption.COPY_ATTRIBUTES }; Files.copy(p, new File (cloneID+"-test.ptweets").toPath(), options); } */ } // Si el fichero arff no existe, lo creo. 
en caso contrario vengo haciendo lo que hasta ahora, // aprovechar trabajo previo if (!new File(id + "-test.arff").exists()) { BufferedWriter bw = topics.generateArffHeader(id + "-test.arff"); int tope = testcorpus.size(); if (tweetsProcessor == null) tweetsProcessor = new ProcessTweets( stemming, lematization, urls, hashtags, mentions, unigrams, bigrams); for (int indTweet = 0; indTweet < tope; indTweet++) { topics.generateArffVector(bw, test.get(indTweet)); } bw.flush(); bw.close(); } int topeTopics = topics.getTopicsList().size(); topics.getTopicsList().clear(); // Genero el clasificador // FJRM 25-08-2013 Lo cambio de orden para intentar liberar la memoria de los topics y tener // más libre System.out.println("4-Generate classifier " + dateFormat.format(new Date())); out.println("4-Generate classifier " + dateFormat.format(new Date())); Classifier cls = null; DataSource sourceTrain = null; Instances dataTrain = null; if (new File(id + "-MNB.classifier").exists()) { ObjectInputStream ois = new ObjectInputStream(new FileInputStream(id + "-MNB.classifier")); cls = (Classifier) ois.readObject(); ois.close(); } else { sourceTrain = new DataSource(id + "-train.arff"); dataTrain = sourceTrain.getDataSet(); if (dataTrain.classIndex() == -1) dataTrain.setClassIndex(dataTrain.numAttributes() - 1); // Entreno el clasificador cls = new weka.classifiers.bayes.NaiveBayesMultinomial(); int clase = dataTrain.numAttributes() - 1; dataTrain.setClassIndex(clase); cls.buildClassifier(dataTrain); ObjectOutputStream oos = new ObjectOutputStream(new FileOutputStream(id + "-MNB.classifier")); oos.writeObject(cls); oos.flush(); oos.close(); // data.delete();//no borro para el svm } // Ahora evaluo el clasificador con los datos de test System.out.println("6-Evaluate classifier MNB " + dateFormat.format(new Date())); out.println("6-Evaluate classifier MNB" + dateFormat.format(new Date())); DataSource sourceTest = new DataSource(id + "-test.arff"); Instances dataTest = 
sourceTest.getDataSet(); int clase = dataTest.numAttributes() - 1; dataTest.setClassIndex(clase); Evaluation eval = new Evaluation(dataTest); eval.evaluateModel(cls, dataTest); // Ahora calculo los valores precision, recall y fmeasure. Además saco las matrices de // confusion precision = 0; recall = 0; fmeasure = 0; for (int ind = 0; ind < topeTopics; ind++) { precision += eval.precision(ind); recall += eval.recall(ind); fmeasure += eval.fMeasure(ind); } precision = precision / topeTopics; recall = recall / topeTopics; fmeasure = fmeasure / topeTopics; System.out.println("+++++++++++++++++++++++++++++++++++++++"); System.out.println(eval.toMatrixString()); System.out.println("+++++++++++++++++++++++++++++++++++++++"); System.out.printf("Precision: %.3f\n", precision); System.out.printf("Recall: %.3f\n", recall); System.out.printf("F-measure: %.3f\n", fmeasure); System.out.println("***************************************"); out.println("+++++++++++++++++++++++++++++++++++++++"); out.println(eval.toMatrixString()); out.println("+++++++++++++++++++++++++++++++++++++++"); out.printf("Precision: %.3f\n", precision); out.printf("Recall: %.3f\n", recall); out.printf("F-measure: %.3f\n", fmeasure); out.println("***************************************"); /* NO BORRAR System.out.println("7-Evaluate classifier SVM"+dateFormat.format(new Date())); out.println("7-Evaluate classifier SVM"+dateFormat.format(new Date())); if (new File(id+"-SVM.classifier").exists()) { ObjectInputStream ois = new ObjectInputStream(new FileInputStream(id+"-SVM.classifier")); cls = (Classifier) ois.readObject(); ois.close(); } else { if (dataTrain==null) { sourceTrain = new DataSource(id+"-train.arff"); dataTrain = sourceTrain.getDataSet(); if (dataTrain.classIndex() == -1) dataTrain.setClassIndex(dataTrain.numAttributes() - 1); } //Entreno el clasificador cls = new weka.classifiers.functions.LibSVM(); clase = dataTrain.numAttributes()-1; dataTrain.setClassIndex(clase); 
cls.buildClassifier(dataTrain); ObjectOutputStream oos = new ObjectOutputStream(new FileOutputStream(id+"-SVM.classifier")); oos.writeObject(cls); oos.flush(); oos.close(); dataTrain.delete(); } eval.evaluateModel(cls, dataTest); precision=0; recall=0; fmeasure=0; for(int ind=0; ind<topeTopics; ind++) { precision += eval.precision(ind); recall += eval.recall(ind); fmeasure += eval.fMeasure(ind); } precision = precision / topeTopics; recall = recall / topeTopics; fmeasure = fmeasure / topeTopics; System.out.println("+++++++++++++++++++++++++++++++++++++++"); System.out.println(eval.toMatrixString()); System.out.println("+++++++++++++++++++++++++++++++++++++++"); System.out.printf("Precision: %.3f\n", precision); System.out.printf("Recall: %.3f\n", recall); System.out.printf("F-measure: %.3f\n", fmeasure); System.out.println("***************************************"); out.println("+++++++++++++++++++++++++++++++++++++++"); out.println(eval.toMatrixString()); out.println("+++++++++++++++++++++++++++++++++++++++"); out.printf("Precision: %.3f\n", precision); out.printf("Recall: %.3f\n", recall); out.printf("F-measure: %.3f\n", fmeasure); out.println("***************************************"); */ System.out.println("Done " + dateFormat.format(new Date())); out.println("Done " + dateFormat.format(new Date())); if (printer == null) { out.flush(); out.close(); } // Intento de liberar memoria if (dataTrain != null) dataTrain.delete(); if (dataTest != null) dataTest.delete(); if (train != null) train.clear(); if (test != null) test.clear(); if (topics != null) { topics.getTopicsList().clear(); topics = null; } if (dataTest != null) dataTest.delete(); if (cls != null) cls = null; if (tweetsProcessor != null) tweetsProcessor = null; System.gc(); } catch (Exception e) { e.printStackTrace(); } }
/**
 * Removes a specified number of instances from the given set of instances.
 *
 * @param data given instances
 * @param numRemove number of instances to delete from the given instances
 */
protected void removeInstances(Instances data, int numRemove) {
  // Drop numRemove instances from the tail, one at a time.
  for (int k = 0; k < numRemove; k++) {
    data.delete(data.numInstances() - 1);
  }
}
/** * Calculates the class membership probabilities for the given test instance. * * @param instance the instance to be classified * @return predicted class probability distribution * @throws Exception if an error occurred during the prediction */ public double[] distributionForInstance(Instance instance) throws Exception { NaiveBayes nb = new NaiveBayes(); // System.out.println("number of instances "+m_Train.numInstances()); if (m_Train.numInstances() == 0) { // throw new Exception("No training instances!"); return m_defaultModel.distributionForInstance(instance); } if ((m_WindowSize > 0) && (m_Train.numInstances() > m_WindowSize)) { m_kNNValid = false; boolean deletedInstance = false; while (m_Train.numInstances() > m_WindowSize) { m_Train.delete(0); } // rebuild datastructure KDTree currently can't delete if (deletedInstance == true) m_NNSearch.setInstances(m_Train); } // Select k by cross validation if (!m_kNNValid && (m_CrossValidate) && (m_kNNUpper >= 1)) { crossValidate(); } m_NNSearch.addInstanceInfo(instance); m_kNN = 1000; Instances neighbours = m_NNSearch.kNearestNeighbours(instance, m_kNN); double[] distances = m_NNSearch.getDistances(); // System.out.println("--------------classify instance--------- "); // System.out.println("neighbours.numInstances"+neighbours.numInstances()); // System.out.println("distances.length"+distances.length); // System.out.println("--------------classify instance--------- "); /* for (int k = 0; k < distances.length; k++) { //System.out.println("-------"); //System.out.println("distance of "+k+" "+distances[k]); //System.out.println("instance of "+k+" "+neighbours.instance(k)); //distances[k] = distances[k]+0.1; //System.out.println("------- after add 0.1"); //System.out.println("distance of "+k+" "+distances[k]); } */ Instances instances = new Instances(m_Train); // int attrnum = instances.numAttributes(); instances.deleteWithMissingClass(); Instances newm_Train = new Instances(instances, 0, instances.numInstances()); for 
(int k = 0; k < neighbours.numInstances(); k++) { // System.out.println("-------"); // Instance in = new Instance(); Instance insk = neighbours.instance(k); // System.out.println("instance "+k+" "+neighbours.instance(k)); // System.out.println("-------"); double dis = distances[k] + 0.1; // System.out.println("dis "+dis); dis = (1 / dis) * 10; // System.out.println("1/dis "+dis); int weightnum = (int) dis; // System.out.println("weightnum "+weightnum); for (int s = 0; s < weightnum; s++) { newm_Train.add(insk); } } // System.out.println("number of instances "+newm_Train.numInstances()); /* for (int k = 0; k < newm_Train.numInstances(); k++) { System.out.println("-------"); System.out.println("instance "+k+" "+newm_Train.instance(k)); System.out.println("-------"); } /* for (int k = 0; k < distances.length; k++) { System.out.println("-------"); System.out.println("distance of "+k+" "+distances[k]); System.out.println("-------"); }*/ nb.buildClassifier(newm_Train); double[] dis = nb.distributionForInstance(instance); // double[] distribution = makeDistribution(neighbours, distances); return dis; // return distribution; }