// Measure the error rate on the instances where classifiers h1 and h2 make the same decision.
public double measureBothError(Classifier h1, Classifier h2, Instances test) {
  int m = test.numInstances();
  double value1, value2, value;
  int error = 0, total = 0;
  try {
    for (int i = 0; i < m; i++) {
      value = test.instance(i).classValue();
      value1 = h1.classifyInstance(test.instance(i));
      value2 = h2.classifyInstance(test.instance(i));
      // both classifiers make the same decision
      if (value1 == value2) {
        // number of instances on which both classifiers agree
        total++;
        // both classifiers make the same wrong decision
        if (value != value1) {
          // number of instances on which both classifiers are wrong
          error++;
        }
      }
    }
  } catch (Exception e) {
    System.out.println(e);
  }
  // System.out.println("m:=" + m);
  // System.out.println("error:=" + error + "; total:=" + total);
  // joint error rate = (# instances both classifiers get wrong) / (# instances both agree on)
  return (error * 1.0) / total;
}
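Below is a minimal, hypothetical call site for measureBothError, assuming two base classifiers and a labeled Instances set named labeledData are already available; the learner choices (J48, NaiveBayes) and the 0.5 threshold are illustrative, not part of the original code.

// Sketch only: estimate the joint error of two trained classifiers on labeled data
// before trusting their agreed-upon labels (e.g. in a co-training style loop).
Classifier h1 = new J48();          // weka.classifiers.trees.J48
Classifier h2 = new NaiveBayes();   // weka.classifiers.bayes.NaiveBayes
h1.buildClassifier(labeledData);    // labeledData: Instances with the class index set
h2.buildClassifier(labeledData);
double jointError = measureBothError(h1, h2, labeledData);
if (jointError < 0.5) {
  // agreement between h1 and h2 is considered informative enough to pseudo-label new data
}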
public static void main(String[] args) throws Exception {
  /*
   * First we load the test data from our ARFF file
   */
  ArffLoader testLoader = new ArffLoader();
  testLoader.setSource(new File("data/titanic/test.arff"));
  testLoader.setRetrieval(Loader.BATCH);
  Instances testDataSet = testLoader.getDataSet();

  /*
   * Now we tell the data set which attribute we want to classify, in our
   * case, we want to classify the first column: survived
   */
  Attribute testAttribute = testDataSet.attribute(0);
  testDataSet.setClass(testAttribute);
  testDataSet.deleteStringAttributes();

  /*
   * Now we read in the serialized model from disk
   */
  Classifier classifier = (Classifier) SerializationHelper.read("data/titanic/titanic.model");

  /*
   * This part may be a little confusing. We load up the test data again
   * so we have a prediction data set to populate. As we iterate over the
   * first data set we also iterate over the second data set. After an
   * instance is classified, we set the value of the prediction data set
   * to be the value of the classification.
   */
  ArffLoader test1Loader = new ArffLoader();
  test1Loader.setSource(new File("data/titanic/test.arff"));
  Instances test1DataSet = test1Loader.getDataSet();
  Attribute test1Attribute = test1DataSet.attribute(0);
  test1DataSet.setClass(test1Attribute);

  /*
   * Now we iterate over the test data, classify each entry, and set the
   * value of the 'survived' column to the result of the classification.
   */
  Enumeration testInstances = testDataSet.enumerateInstances();
  Enumeration test1Instances = test1DataSet.enumerateInstances();
  while (testInstances.hasMoreElements()) {
    Instance instance = (Instance) testInstances.nextElement();
    Instance instance1 = (Instance) test1Instances.nextElement();
    double classification = classifier.classifyInstance(instance);
    instance1.setClassValue(classification);
  }

  /*
   * Now we want to write out our predictions. The resulting file is in a
   * format suitable to submit to Kaggle.
   */
  CSVSaver predictedCsvSaver = new CSVSaver();
  predictedCsvSaver.setFile(new File("data/titanic/predict.csv"));
  predictedCsvSaver.setInstances(test1DataSet);
  predictedCsvSaver.writeBatch();

  System.out.println("Prediction saved to predict.csv");
}
private static int calculatenpeople(Classifier ibkregression, Instance instregression) {
  Double predictValue = null;
  try {
    predictValue = ibkregression.classifyInstance(instregression);
  } catch (Exception e) {
    e.printStackTrace();
  }
  // note: if classification fails, predictValue stays null and intValue() throws a NullPointerException
  return predictValue.intValue();
}
// Classify the sample set with h1 and h2, and add every instance on which h1 and h2
// make the same decision to L, producing the labeled set.
public void updateL(Classifier h1, Classifier h2, Instances L, Instances test) {
  int length = unlabeledIns.numInstances();
  double value1 = 0.0, value2 = 0.0;
  try {
    for (int i = 0; i < length; i++) {
      value1 = h1.classifyInstance(test.instance(i));
      value2 = h2.classifyInstance(test.instance(i));
      if (value1 == value2) {
        // when both classifiers agree, relabel the instance with that class
        test.instance(i).setClassValue(value1);
        L.add(test.instance(i));
      }
    }
  } catch (Exception e) {
    System.out.println(e);
  }
  // return false;
}
/** Classification step: classify a single message. */
public double classifyMessage(String message) throws Exception {
  filter.input(makeInstance(message, instances.stringFreeStructure()));
  Instance filteredInstance = filter.output();  // the original filter must be reused here
  double predicted = classifier.classifyInstance(filteredInstance);
  // (int) predicted is the index of the predicted class label
  // System.out.println("Message classified as : "
  //     + instances.classAttribute().value((int) predicted));
  return predicted;
}
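A hedged usage example for classifyMessage, assuming the surrounding class holds the same instances header used above; the message text is a placeholder.

// Sketch only: classify a raw message and map the returned class index back to its label.
double idx = classifyMessage("congratulations, you won a free prize");
String predictedLabel = instances.classAttribute().value((int) idx);
System.out.println("Predicted label: " + predictedLabel);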
public double getAccuracy(Classifier classifier, Instances test) throws Exception {
  int corrCnt = 0;
  int totCnt = 0;
  for (Instance instance : test) {
    int predClass = (int) classifier.classifyInstance(instance);
    int realClass = (int) instance.classValue();
    if (predClass == realClass) {
      corrCnt++;
    }
    totCnt++;
  }
  return (double) corrCnt / totCnt;
}
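A short sketch of calling getAccuracy, assuming a classifier trained elsewhere (trainedClassifier) and a test ARFF file whose last attribute is the class; the file name is a placeholder.

// Sketch only: evaluate a trained model on a held-out test set.
Instances test = DataSource.read("test.arff");  // weka.core.converters.ConverterUtils.DataSource
test.setClassIndex(test.numAttributes() - 1);
double acc = getAccuracy(trainedClassifier, test);
System.out.println("Accuracy: " + acc);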
public int SelectRow_KLDivergenceMisclassified(
    Instances pool, Classifier myEstimator, int desiredAttr) {
  // For each instance with an unbought desiredAttr and label = desiredLabel,
  // measure the KL-divergence (relative entropy between two probability distributions):
  //   KL(P||Q) = sum_i p_i log(p_i / q_i)
  // With respect to Q = Uniform, this reduces to
  //   KL(P||U) = sum_i p_i log(p_i)
  // Choose the row that is minimum (i.e. closest to uniform).
  int numInstances = pool.numInstances();
  double[] KLDivs = new double[numInstances];
  boolean[] isValidInstance = new boolean[numInstances];
  boolean misclassified = false;
  double[] probs = null;
  Instance inst;

  for (int i = 0; i < numInstances; i++) {
    inst = pool.instance(i);
    try {
      if (inst.classValue() != myEstimator.classifyInstance(inst))
        misclassified = true;
      else
        misclassified = false;
    } catch (Exception e1) {
      e1.printStackTrace();
    }
    if (inst.isMissing(desiredAttr) && misclassified) {
      try {
        probs = myEstimator.distributionForInstance(inst);
      } catch (Exception e) {
        e.printStackTrace();
      }
      for (int j = 0; j < probs.length; j++)
        KLDivs[i] += MyXLogX(probs[j]);
      isValidInstance[i] = true;
    } else {
      KLDivs[i] = Double.MAX_VALUE;
      isValidInstance[i] = false;
    }
  }

  double leastDivergence = KLDivs[Utils.minIndex(KLDivs)];
  int numLeastDivs = 0;
  for (int i = 0; i < numInstances; i++)
    if (isValidInstance[i] && KLDivs[i] == leastDivergence)
      numLeastDivs++;
  int randomInstance = r.nextInt(numLeastDivs);
  int index = 0;
  for (int i = 0; i < numInstances; i++) {
    if (isValidInstance[i] && KLDivs[i] == leastDivergence) {
      if (index == randomInstance)
        return i;
      else
        index++;
    }
  }
  return -1;
}
private static boolean calcultateifhotspot(
    Classifier ibkclassification, Instance instclassification) {
  Double predictValue = null;
  try {
    predictValue = ibkclassification.classifyInstance(instclassification);
  } catch (Exception e) {
    e.printStackTrace();
  }
  return predictValue == 0.0;
}
public void classify(String filename) throws Exception {
  Instances unLabeledData = DataSource.read(filename);
  unLabeledData.setClassIndex(unLabeledData.numAttributes() - 1);
  Instances LabeledData = new Instances(unLabeledData);
  for (int i = 0; i < unLabeledData.numInstances(); ++i) {
    double clsLabel = classifier.classifyInstance(unLabeledData.instance(i));
    LabeledData.instance(i).setClassValue(clsLabel);
  }
  System.out.println(LabeledData.toString());
}
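If the labeled copy should be persisted rather than printed, a possible variation (a sketch only, assuming the same LabeledData variable and an output path of your choosing) writes it back out as ARFF:

// Sketch only: save the labeled copy instead of printing it.
ArffSaver saver = new ArffSaver();               // weka.core.converters.ArffSaver
saver.setInstances(LabeledData);
saver.setFile(new File("labeled-output.arff"));  // placeholder path
saver.writeBatch();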
@Override
public double classifyInstance(Instance instance) throws Exception {
  double sum = 0.0;
  for (Classifier classifier : classifiers) {
    double classification = classifier.classifyInstance(instance);
    sum += classification;
  }
  // vote over binary (0/1) predictions; note that classifiers.size() / 2 uses integer
  // division, so ties and some minority counts (e.g. 2 of 5 votes) still return 1.0
  if (sum >= classifiers.size() / 2) {
    return 1.0;
  }
  return 0.0;
}
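A minimal sketch of driving such a voting wrapper, assuming the enclosing class exposes the classifiers list used above; the base learners and the train set are placeholders.

// Sketch only: train a few 0/1 base classifiers that the voting wrapper will aggregate.
List<Classifier> classifiers = new ArrayList<Classifier>();
classifiers.add(new J48());
classifiers.add(new NaiveBayes());
classifiers.add(new Logistic());
for (Classifier c : classifiers) {
  c.buildClassifier(train);  // train: Instances with a binary (0/1) class attribute
}
// hand the list to the enclosing voting classifier, then call its classifyInstance(...)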
public String classifyInstance(Instance wekaInstance) {
  String label = null;
  try {
    double labelIndex = bayesNet.classifyInstance(wekaInstance);
    wekaInstance.setClassValue(labelIndex);
    label = wekaInstance.toString(wekaInstance.classIndex());
  } catch (Exception e) {
    System.err.println(e.getMessage());
    e.printStackTrace();
    System.exit(-1);
  }
  return label;
}
/**
 * Classifies a tweet.
 *
 * @param stringa the tweet to classify
 * @return the tweet polarity
 */
public double classifyDouble(String stringa) throws FileNotFoundException, IOException, Exception {
  String string_new;
  Preprocesser pre = new Preprocesser();
  string_new = pre.preprocessDocument(stringa);
  String tmp = "";
  StringTokenizer st = new StringTokenizer(string_new, " ");

  // Instances unlabeled = new Instances(new BufferedReader(new FileReader("D:/prova.arff")));
  Instances unlabeled = new Instances(_train, 1);
  Instance inst = new Instance(unlabeled.numAttributes());

  // load unlabeled data
  inst.setDataset(unlabeled);
  int j = 0;
  while (j < unlabeled.numAttributes()) {
    inst.setValue(j, "0");
    j++;
  }
  while (st.hasMoreTokens()) {
    tmp = st.nextToken();
    if (unlabeled.attribute(tmp) != null)
      inst.setValue(unlabeled.attribute(tmp), "1");
  }
  unlabeled.add(inst);

  // set class attribute
  unlabeled.setClassIndex(unlabeled.numAttributes() - 1);

  // create copy
  Instances labeled = new Instances(unlabeled);

  // label instances
  for (int i = 0; i < unlabeled.numInstances(); i++) {
    double clsLabel = _cl.classifyInstance(unlabeled.instance(i));
    labeled.instance(i).setClassValue(clsLabel);
  }
  // return labeled.instance(0).stringValue(unlabeled.numAttributes() - 1);
  // System.out.println("weight: " + labeled.instance(0).weight());
  return labeled.instance(0).classValue();
}
/**
 * Classifies a given instance using the selected classifier.
 *
 * @param instance the instance to be classified
 * @return the predicted class value
 * @exception Exception if instance could not be classified successfully
 */
public double classifyInstance(Instance instance) throws Exception {
  return m_Classifier.classifyInstance(instance);
}
public static void main(String[] args) throws Exception {
  // NaiveBayesSimple nb = new NaiveBayesSimple();
  // BufferedReader br_train = new BufferedReader(new FileReader("src/train.arff.txt"));
  // String s = null;
  // long st_time = System.currentTimeMillis();
  // Instances inst_train = new Instances(br_train);
  // System.out.println(inst_train.numAttributes());
  // inst_train.setClassIndex(inst_train.numAttributes() - 1);
  // System.out.println("train time" + (System.currentTimeMillis() - st_time));
  // NaiveBayes nb1 = new NaiveBayes();
  // nb1.buildClassifier(inst_train);
  // br_train.close();

  long st_time = System.currentTimeMillis();
  st_time = System.currentTimeMillis();

  Classifier classifier = (Classifier) SerializationHelper.read("NaiveBayes.model");

  // BufferedReader br_test = new BufferedReader(new FileReader("src/test.arff.txt"));
  // Instances inst_test = new Instances(br_test);
  // inst_test.setClassIndex(inst_test.numAttributes() - 1);
  // System.out.println("test time" + (System.currentTimeMillis() - st_time));

  ArffLoader testLoader = new ArffLoader();
  testLoader.setSource(new File("src/test.arff"));
  testLoader.setRetrieval(Loader.BATCH);
  Instances testDataSet = testLoader.getDataSet();

  Attribute testAttribute = testDataSet.attribute("class");
  testDataSet.setClass(testAttribute);

  int correct = 0;
  int incorrect = 0;

  FastVector attInfo = new FastVector();
  attInfo.addElement(new Attribute("Id"));
  attInfo.addElement(new Attribute("Category"));
  Instances outputInstances = new Instances("predict", attInfo, testDataSet.numInstances());

  Enumeration testInstances = testDataSet.enumerateInstances();
  int index = 1;
  while (testInstances.hasMoreElements()) {
    Instance instance = (Instance) testInstances.nextElement();
    double classification = classifier.classifyInstance(instance);
    // update the hit/miss counts only when the test set actually carries class labels;
    // otherwise the accuracy printed below is meaningless
    if (!instance.classIsMissing()) {
      if ((int) classification == (int) instance.classValue()) {
        correct++;
      } else {
        incorrect++;
      }
    }
    Instance predictInstance = new Instance(outputInstances.numAttributes());
    predictInstance.setValue(0, index++);
    predictInstance.setValue(1, (int) classification + 1);
    outputInstances.add(predictInstance);
  }

  System.out.println("Correct Instances: " + correct);
  System.out.println("Incorrect Instances: " + incorrect);
  double accuracy = (double) (correct) / (double) (correct + incorrect);
  System.out.println("Accuracy: " + accuracy);

  CSVSaver predictedCsvSaver = new CSVSaver();
  predictedCsvSaver.setFile(new File("predict.csv"));
  predictedCsvSaver.setInstances(outputInstances);
  predictedCsvSaver.writeBatch();

  System.out.println("Prediction saved to predict.csv");
}
// Given a question string, return the predicted question category.
public double classifyByBayes(String question) throws Exception {
  double label = -1;
  List<Question> questionID = questionDAO.getQuestionIDLabeled();

  // define the data format (attribute names kept exactly as in the original data)
  Attribute att1 = new Attribute("法律政策");
  Attribute att2 = new Attribute("位置交通");
  Attribute att3 = new Attribute("风水");
  Attribute att4 = new Attribute("房价");
  Attribute att5 = new Attribute("楼层");
  Attribute att6 = new Attribute("户型");
  Attribute att7 = new Attribute("小区配套");
  Attribute att8 = new Attribute("贷款");
  Attribute att9 = new Attribute("买房时机");
  Attribute att10 = new Attribute("开发商");
  FastVector labels = new FastVector();
  labels.addElement("1");
  labels.addElement("2");
  labels.addElement("3");
  labels.addElement("4");
  labels.addElement("5");
  labels.addElement("6");
  labels.addElement("7");
  labels.addElement("8");
  labels.addElement("9");
  labels.addElement("10");
  Attribute att11 = new Attribute("类别", labels);
  FastVector attributes = new FastVector();
  attributes.addElement(att1);
  attributes.addElement(att2);
  attributes.addElement(att3);
  attributes.addElement(att4);
  attributes.addElement(att5);
  attributes.addElement(att6);
  attributes.addElement(att7);
  attributes.addElement(att8);
  attributes.addElement(att9);
  attributes.addElement(att10);
  attributes.addElement(att11);
  Instances dataset = new Instances("Test-dataset", attributes, 0);
  dataset.setClassIndex(10);

  Classifier classifier = null;
  if (!new File("naivebayes.model").exists()) {
    // add the training data
    double[] values = new double[11];
    for (int i = 0; i < questionID.size(); i++) {
      for (int m = 0; m < 11; m++) {
        values[m] = 0;
      }
      int whitewordcount = 0;
      whitewordcount = questionDAO.getHitWhiteWordNum(questionID.get(i).getId());
      if (whitewordcount != 0) {
        List<QuestionWhiteWord> questionwhiteword =
            questionDAO.getHitQuestionWhiteWord(questionID.get(i).getId());
        for (int j = 0; j < questionwhiteword.size(); j++) {
          values[getAttIndex(questionwhiteword.get(j).getWordId()) - 1]++;
        }
        for (int m = 0; m < 11; m++) {
          values[m] = values[m] / whitewordcount;
        }
      }
      values[10] = questionID.get(i).getType() - 1;
      Instance inst = new Instance(1.0, values);
      dataset.add(inst);
    }
    // build the classifier
    classifier = new NaiveBayes();
    classifier.buildClassifier(dataset);
    SerializationHelper.write("naivebayes.model", classifier);
  } else {
    classifier = (Classifier) SerializationHelper.read("naivebayes.model");
  }

  System.out.println("*************begin evaluation*******************");
  Evaluation evaluation = new Evaluation(dataset);
  // strictly speaking this should evaluate on a separate data set, not the training data
  evaluation.evaluateModel(classifier, dataset);
  System.out.println(evaluation.toSummaryString());

  // classification
  System.out.println("*************begin classification*******************");
  Instance subject = new Instance(1.0, getQuestionVector(question));
  subject.setDataset(dataset);
  label = classifier.classifyInstance(subject);
  System.out.println("label: " + label);

  // double dis[] = classifier.distributionForInstance(inst);
  // for (double i : dis) {
  //   System.out.print(i + " ");
  // }

  System.out.println(questionID.size());
  return label + 1;
}
/**
 * Classifies a given instance.
 *
 * @param instance the instance to be classified
 * @return index of the predicted class
 * @throws Exception if an error occurred during the prediction
 */
public double classifyInstance(Instance instance) throws Exception {
  if (m_GroovyObject != null)
    return m_GroovyObject.classifyInstance(instance);
  else
    return Utils.missingValue();
}
/**
 * Cleanses the data based on misclassifications when using the training data.
 *
 * @param data the data to train with and cleanse
 * @return the cleansed data
 * @throws Exception if something goes wrong
 */
private Instances cleanseTrain(Instances data) throws Exception {
  Instance inst;
  Instances buildSet = new Instances(data);
  Instances temp = new Instances(data, data.numInstances());
  Instances inverseSet = new Instances(data, data.numInstances());
  int count = 0;
  double ans;
  int iterations = 0;
  int classIndex = m_classIndex;
  if (classIndex < 0) {
    classIndex = data.classIndex();
  }
  if (classIndex < 0) {
    classIndex = data.numAttributes() - 1;
  }

  // loop until perfect
  while (count != buildSet.numInstances()) {
    // check if hit maximum number of iterations
    iterations++;
    if (m_numOfCleansingIterations > 0 && iterations > m_numOfCleansingIterations) {
      break;
    }

    // build classifier
    count = buildSet.numInstances();
    buildSet.setClassIndex(classIndex);
    m_cleansingClassifier.buildClassifier(buildSet);

    temp = new Instances(buildSet, buildSet.numInstances());

    // test on training data
    for (int i = 0; i < buildSet.numInstances(); i++) {
      inst = buildSet.instance(i);
      ans = m_cleansingClassifier.classifyInstance(inst);
      if (buildSet.classAttribute().isNumeric()) {
        if (ans >= inst.classValue() - m_numericClassifyThreshold
            && ans <= inst.classValue() + m_numericClassifyThreshold) {
          temp.add(inst);
        } else if (m_invertMatching) {
          inverseSet.add(inst);
        }
      } else { // class is nominal
        if (ans == inst.classValue()) {
          temp.add(inst);
        } else if (m_invertMatching) {
          inverseSet.add(inst);
        }
      }
    }
    buildSet = temp;
  }

  if (m_invertMatching) {
    inverseSet.setClassIndex(data.classIndex());
    return inverseSet;
  } else {
    buildSet.setClassIndex(data.classIndex());
    return buildSet;
  }
}
/**
 * @param args the command line arguments
 * @throws Exception
 */
public static void main(String[] args) throws Exception {
  PreProcessor p = new PreProcessor("census-income.data", "census-income-preprocessed.arff");
  p.smote();
  PreProcessor p_test =
      new PreProcessor("census-income.test", "census-income-test-preprocessed.arff");
  p_test.run();

  BufferedReader traindata =
      new BufferedReader(new FileReader("census-income-preprocessed.arff"));
  BufferedReader testdata =
      new BufferedReader(new FileReader("census-income-test-preprocessed.arff"));
  Instances traininstance = new Instances(traindata);
  Instances testinstance = new Instances(testdata);
  traindata.close();
  testdata.close();
  traininstance.setClassIndex(traininstance.numAttributes() - 1);
  testinstance.setClassIndex(testinstance.numAttributes() - 1);
  int numOfAttributes = testinstance.numAttributes();
  int numOfInstances = testinstance.numInstances();

  NaiveBayesClassifier nb = new NaiveBayesClassifier("census-income-preprocessed.arff");
  Classifier cnaive = nb.NBClassify();

  DecisionTree dt = new DecisionTree("census-income-preprocessed.arff");
  Classifier cls = dt.DTClassify();

  AdaBoost ab = new AdaBoost("census-income-preprocessed.arff");
  AdaBoostM1 m1 = ab.AdaBoostDTClassify();

  BaggingMethod b = new BaggingMethod("census-income-preprocessed.arff");
  Bagging bag = b.BaggingDTClassify();

  SVM s = new SVM("census-income-preprocessed.arff");
  SMO svm = s.SMOClassifier();

  knn knnclass = new knn("census-income-preprocessed.arff");
  IBk knnc = knnclass.knnclassifier();

  Logistic log = new Logistic();
  log.buildClassifier(traininstance);

  int match = 0;
  int error = 0;
  int greater = 0;
  int less = 0;

  for (int i = 0; i < numOfInstances; i++) {
    String predicted = "";
    greater = 0;
    less = 0;
    double predictions[] = new double[8];

    double pred = cls.classifyInstance(testinstance.instance(i));
    predictions[0] = pred;

    double abpred = m1.classifyInstance(testinstance.instance(i));
    predictions[1] = abpred;

    double naivepred = cnaive.classifyInstance(testinstance.instance(i));
    predictions[2] = naivepred;

    double bagpred = bag.classifyInstance(testinstance.instance(i));
    predictions[3] = bagpred;

    double smopred = svm.classifyInstance(testinstance.instance(i));
    predictions[4] = smopred;

    double knnpred = knnc.classifyInstance(testinstance.instance(i));
    predictions[5] = knnpred;

    for (int j = 0; j < 6; j++) {
      if ((testinstance.instance(i).classAttribute().value((int) predictions[j]))
              .compareTo(">50K") == 0)
        greater++;
      else
        less++;
    }

    if (greater > less)
      predicted = ">50K";
    else
      predicted = "<=50K";

    if ((testinstance.instance(i).stringValue(numOfAttributes - 1)).compareTo(predicted) == 0)
      match++;
    else
      error++;
  }

  System.out.println("Correctly classified Instances: " + match);
  System.out.println("Misclassified Instances: " + error);
  double accuracy = (double) match / (double) numOfInstances * 100;
  double error_percent = 100 - accuracy;
  System.out.println("Accuracy: " + accuracy + "%");
  System.out.println("Error: " + error_percent + "%");
}
/**
 * Cleanses the data based on misclassifications when performing cross-validation.
 *
 * @param data the data to train with and cleanse
 * @return the cleansed data
 * @throws Exception if something goes wrong
 */
private Instances cleanseCross(Instances data) throws Exception {
  Instance inst;
  Instances crossSet = new Instances(data);
  Instances temp = new Instances(data, data.numInstances());
  Instances inverseSet = new Instances(data, data.numInstances());
  int count = 0;
  double ans;
  int iterations = 0;
  int classIndex = m_classIndex;
  if (classIndex < 0) {
    classIndex = data.classIndex();
  }
  if (classIndex < 0) {
    classIndex = data.numAttributes() - 1;
  }

  // loop until perfect
  while (count != crossSet.numInstances()
      && crossSet.numInstances() >= m_numOfCrossValidationFolds) {
    count = crossSet.numInstances();

    // check if hit maximum number of iterations
    iterations++;
    if (m_numOfCleansingIterations > 0 && iterations > m_numOfCleansingIterations) {
      break;
    }

    crossSet.setClassIndex(classIndex);
    if (crossSet.classAttribute().isNominal()) {
      crossSet.stratify(m_numOfCrossValidationFolds);
    }

    // do the folds
    temp = new Instances(crossSet, crossSet.numInstances());
    for (int fold = 0; fold < m_numOfCrossValidationFolds; fold++) {
      Instances train = crossSet.trainCV(m_numOfCrossValidationFolds, fold);
      m_cleansingClassifier.buildClassifier(train);
      Instances test = crossSet.testCV(m_numOfCrossValidationFolds, fold);

      // now test
      for (int i = 0; i < test.numInstances(); i++) {
        inst = test.instance(i);
        ans = m_cleansingClassifier.classifyInstance(inst);
        if (crossSet.classAttribute().isNumeric()) {
          if (ans >= inst.classValue() - m_numericClassifyThreshold
              && ans <= inst.classValue() + m_numericClassifyThreshold) {
            temp.add(inst);
          } else if (m_invertMatching) {
            inverseSet.add(inst);
          }
        } else { // class is nominal
          if (ans == inst.classValue()) {
            temp.add(inst);
          } else if (m_invertMatching) {
            inverseSet.add(inst);
          }
        }
      }
    }
    crossSet = temp;
  }

  if (m_invertMatching) {
    inverseSet.setClassIndex(data.classIndex());
    return inverseSet;
  } else {
    crossSet.setClassIndex(data.classIndex());
    return crossSet;
  }
}
public HashMap<String, String> process(
    Sentence sent, String dep, HashSet<String> terms, List<NamedEntity> entities,
    String author, String aidx) {
  try {
    // System.out.println("ML start!");
    // System.out.println("List : " + terms);
    HashMap<String, String> ht = new HashMap<String, String>();

    // drop entities that contain another extracted entity
    List<NamedEntity> newEntities = new ArrayList<NamedEntity>();
    for (NamedEntity entity : entities) {
      // System.out.println("original: " + entity.entity);
      boolean check = false;
      for (NamedEntity temp : entities) {
        if (entity == temp)
          continue;
        if (entity.entity.contains(temp.entity)) {
          check = true;
        }
      }
      if (!check)
        newEntities.add(entity);
    }

    List<DependencyTriple> dtl = getDependencyTripleList(dep);
    List<NamedEntity> targetCands = new ArrayList<NamedEntity>();
    HashMap<NamedEntity, String> tOpinTerm = new HashMap<NamedEntity, String>();
    List<NamedEntity> holderCands = new ArrayList<NamedEntity>();
    HashMap<NamedEntity, String> hOpinTerm = new HashMap<NamedEntity, String>();

    // write target features to a temporary CSV for Weka
    BufferedWriter writer = new BufferedWriter(new FileWriter("weka_target.csv"));
    writer.write("A1,A2,A3,A4,A5,A6,A7,A8,A9,A10,A11,A12,A13,Class\n");
    boolean check = false;
    List<NamedEntity> targetTmp = new ArrayList<NamedEntity>();
    for (NamedEntity entity : newEntities) {
      // System.out.println("extracted: " + entity.entity);
      String temp = getTargetFeatures(entity, author, terms, dtl);
      // System.out.println(temp);
      if (temp.length() > 1) {
        check = true;
        writer.write(temp);
        String[] toks = temp.split("\n");
        for (int i = 0; i < toks.length; i++) {
          targetTmp.add(entity);
          tOpinTerm.put(entity, toks[i].substring(0, toks[i].indexOf(",")));
        }
      }
    }
    writer.close();

    if (check) {
      DataSource source = new DataSource("weka_target.csv");
      Instances testdata = source.getDataSet();
      testdata.setClassIndex(testdata.numAttributes() - 1);
      Classifier models =
          (Classifier) weka.core.SerializationHelper.read("target_smoreg.model");
      if (testdata.numInstances() != targetTmp.size())
        System.out.println("wrong number of instances");
      for (int i = 0; i < testdata.numInstances(); i++) {
        double pred = models.classifyInstance(testdata.instance(i));
        if (pred >= 1.0) {
          // System.out.println(pred + " , " + targetTmp.get(i).entity);
          targetCands.add(targetTmp.get(i));
        }
      }
    }

    // write holder features to a temporary CSV for Weka
    writer = new BufferedWriter(new FileWriter("weka_holder.csv"));
    writer.write("A1,A2,A3,A4,A5,A6,A7,A8,A9,A10,A11,A12,A13,A14,A15,Class\n");
    check = false;
    List<NamedEntity> holderTmp = new ArrayList<NamedEntity>();
    for (NamedEntity entity : newEntities) {
      // System.out.println("extracted: " + entity.entity);
      String temp = getHolderFeatures(entity, author, terms, dtl);
      // System.out.println(temp);
      if (temp.length() > 1) {
        check = true;
        writer.write(temp);
        String[] toks = temp.split("\n");
        for (int i = 0; i < toks.length; i++) {
          holderTmp.add(entity);
          hOpinTerm.put(entity, toks[i].substring(0, toks[i].indexOf(",")));
        }
      }
    }
    writer.close();

    if (check) {
      DataSource source = new DataSource("weka_holder.csv");
      Instances testdata = source.getDataSet();
      testdata.setClassIndex(testdata.numAttributes() - 1);
      Classifier models =
          (Classifier) weka.core.SerializationHelper.read("holder_smoreg.model");
      if (testdata.numInstances() != holderTmp.size())
        System.out.println("wrong number of instances");
      for (int i = 0; i < testdata.numInstances(); i++) {
        double pred = models.classifyInstance(testdata.instance(i));
        if (pred >= 1.0) {
          // System.out.println(pred + " , " + holderTmp.get(i).entity);
          holderCands.add(holderTmp.get(i));
        }
      }
    }

    if ((targetCands.size() == 0) || (holderCands.size() == 0))
      return ht;

    // keep holder candidates that are not substrings of a longer candidate
    List<NamedEntity> holderCandTmp = new ArrayList<NamedEntity>();
    for (NamedEntity holderCand : holderCands) {
      boolean hasLonger = false;
      for (NamedEntity temp : holderCands) {
        if (temp.entity.compareTo(holderCand.entity) == 0)
          continue;
        if (temp.entity.contains(holderCand.entity)) {
          hasLonger = true;
          break;
        }
      }
      if (!hasLonger)
        holderCandTmp.add(holderCand);
    }

    // keep target candidates that are not substrings of a longer candidate
    List<NamedEntity> targetCandTmp = new ArrayList<NamedEntity>();
    for (NamedEntity targetCand : targetCands) {
      boolean hasLonger = false;
      for (NamedEntity temp : targetCands) {
        if (temp.entity.compareTo(targetCand.entity) == 0)
          continue;
        if (temp.entity.contains(targetCand.entity)) {
          hasLonger = true;
          break;
        }
      }
      if (!hasLonger)
        targetCandTmp.add(targetCand);
    }

    // pair up holder and target candidates that share the same opinion term
    for (NamedEntity targetCand : targetCandTmp) {
      if (targetCand.entity.compareTo(author) == 0)
        continue;
      for (NamedEntity holderCand : holderCandTmp) {
        if (targetCand.entity.compareTo(holderCand.entity) == 0)
          continue;
        String targetOpin = tOpinTerm.get(targetCand);
        String holderOpin = hOpinTerm.get(holderCand);
        // System.out.println(targetOpin + ", " + holderOpin);
        if (targetOpin.compareTo(holderOpin) != 0)
          continue;
        String opin = targetOpin
            .concat("\t")
            .concat(Integer.toString(sent.sent.indexOf(targetOpin) + sent.beg)
                .concat("-")
                .concat(Integer.toString(
                    sent.sent.indexOf(targetOpin) + sent.beg + targetOpin.length())));
        String holder = holderCand.entity
            .concat("\t")
            .concat(Integer.toString(holderCand.beg)
                .concat("-")
                .concat(Integer.toString(holderCand.end)));
        String target = targetCand.entity
            .concat("\t")
            .concat(Integer.toString(targetCand.beg)
                .concat("-")
                .concat(Integer.toString(targetCand.end)));
        ht.put(targetOpin, opin.concat("\t").concat(holder).concat("\t").concat(target));
      }
    }
    return ht;
  } catch (IOException e) {
    e.printStackTrace();
    return null;
  } catch (Exception e) {
    e.printStackTrace();
    return null;
  }
}