// Measure the error rate on the instances where classifiers h1 and h2 make the same decision.
public double measureBothError(Classifier h1, Classifier h2, Instances test) {
  int m = test.numInstances();
  double value1, value2, value;
  int error = 0, total = 0;
  try {
    for (int i = 0; i < m; i++) {
      value = test.instance(i).classValue();
      value1 = h1.classifyInstance(test.instance(i));
      value2 = h2.classifyInstance(test.instance(i));
      // both classifiers make the same decision
      if (value1 == value2) {
        // number of instances on which both classifiers agree
        total++;
        // both classifiers make the same wrong decision
        if (value != value1) {
          // number of instances on which both classifiers are wrong
          error++;
        }
      }
    }
  } catch (Exception e) {
    System.out.println(e);
  }
  // System.out.println("m:=" + m);
  // System.out.println("error:=" + error + "; total:=" + total);
  // joint error rate = (# instances both classifiers get wrong) / (# instances both agree on)
  return (error * 1.0) / total;
}
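Below is a minimal, hypothetical call site for measureBothError, assuming two base classifiers and a labeled Instances set named labeledData are already available; the learner choices (J48, NaiveBayes) and the 0.5 threshold are illustrative, not part of the original code.

// Sketch only: estimate the joint error of two trained classifiers on labeled data
// before trusting their agreed-upon labels (e.g. in a co-training style loop).
Classifier h1 = new J48();          // weka.classifiers.trees.J48
Classifier h2 = new NaiveBayes();   // weka.classifiers.bayes.NaiveBayes
h1.buildClassifier(labeledData);    // labeledData: Instances with the class index set
h2.buildClassifier(labeledData);
double jointError = measureBothError(h1, h2, labeledData);
if (jointError < 0.5) {
  // agreement between h1 and h2 is considered informative enough to pseudo-label new data
}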
public static void main(String[] args) throws Exception {
  /*
   * First we load the test data from our ARFF file
   */
  ArffLoader testLoader = new ArffLoader();
  testLoader.setSource(new File("data/titanic/test.arff"));
  testLoader.setRetrieval(Loader.BATCH);
  Instances testDataSet = testLoader.getDataSet();

  /*
   * Now we tell the data set which attribute we want to classify, in our
   * case, we want to classify the first column: survived
   */
  Attribute testAttribute = testDataSet.attribute(0);
  testDataSet.setClass(testAttribute);
  testDataSet.deleteStringAttributes();

  /*
   * Now we read in the serialized model from disk
   */
  Classifier classifier = (Classifier) SerializationHelper.read("data/titanic/titanic.model");

  /*
   * This part may be a little confusing. We load up the test data again
   * so we have a prediction data set to populate. As we iterate over the
   * first data set we also iterate over the second data set. After an
   * instance is classified, we set the value of the prediction data set
   * to be the value of the classification.
   */
  ArffLoader test1Loader = new ArffLoader();
  test1Loader.setSource(new File("data/titanic/test.arff"));
  Instances test1DataSet = test1Loader.getDataSet();
  Attribute test1Attribute = test1DataSet.attribute(0);
  test1DataSet.setClass(test1Attribute);

  /*
   * Now we iterate over the test data, classify each entry, and set the
   * value of the 'survived' column to the result of the classification.
   */
  Enumeration testInstances = testDataSet.enumerateInstances();
  Enumeration test1Instances = test1DataSet.enumerateInstances();
  while (testInstances.hasMoreElements()) {
    Instance instance = (Instance) testInstances.nextElement();
    Instance instance1 = (Instance) test1Instances.nextElement();
    double classification = classifier.classifyInstance(instance);
    instance1.setClassValue(classification);
  }

  /*
   * Now we want to write out our predictions. The resulting file is in a
   * format suitable to submit to Kaggle.
   */
  CSVSaver predictedCsvSaver = new CSVSaver();
  predictedCsvSaver.setFile(new File("data/titanic/predict.csv"));
  predictedCsvSaver.setInstances(test1DataSet);
  predictedCsvSaver.writeBatch();

  System.out.println("Prediction saved to predict.csv");
}
private static int calculatenpeople(Classifier ibkregression, Instance instregression) {
  Double predictValue = null;
  try {
    predictValue = ibkregression.classifyInstance(instregression);
  } catch (Exception e) {
    e.printStackTrace();
  }
  // note: if classification fails, predictValue stays null and intValue() throws a NullPointerException
  return predictValue.intValue();
}
// Classify the sample set with h1 and h2, and add every instance on which h1 and h2
// make the same decision to L, producing the labeled set.
public void updateL(Classifier h1, Classifier h2, Instances L, Instances test) {
  int length = unlabeledIns.numInstances();
  double value1 = 0.0, value2 = 0.0;
  try {
    for (int i = 0; i < length; i++) {
      value1 = h1.classifyInstance(test.instance(i));
      value2 = h2.classifyInstance(test.instance(i));
      if (value1 == value2) {
        // when both classifiers agree, relabel the instance with that class
        test.instance(i).setClassValue(value1);
        L.add(test.instance(i));
      }
    }
  } catch (Exception e) {
    System.out.println(e);
  }
  // return false;
}
/** Classification step: classify a single message. */
public double classifyMessage(String message) throws Exception {
  filter.input(makeInstance(message, instances.stringFreeStructure()));
  Instance filteredInstance = filter.output();  // the original filter must be reused here
  double predicted = classifier.classifyInstance(filteredInstance);
  // (int) predicted is the index of the predicted class label
  // System.out.println("Message classified as : "
  //     + instances.classAttribute().value((int) predicted));
  return predicted;
}
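A hedged usage example for classifyMessage, assuming the surrounding class holds the same instances header used above; the message text is a placeholder.

// Sketch only: classify a raw message and map the returned class index back to its label.
double idx = classifyMessage("congratulations, you won a free prize");
String predictedLabel = instances.classAttribute().value((int) idx);
System.out.println("Predicted label: " + predictedLabel);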
public double getAccuracy(Classifier classifier, Instances test) throws Exception {
  int corrCnt = 0;
  int totCnt = 0;
  for (Instance instance : test) {
    int predClass = (int) classifier.classifyInstance(instance);
    int realClass = (int) instance.classValue();
    if (predClass == realClass) {
      corrCnt++;
    }
    totCnt++;
  }
  return (double) corrCnt / totCnt;
}
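A short sketch of calling getAccuracy, assuming a classifier trained elsewhere (trainedClassifier) and a test ARFF file whose last attribute is the class; the file name is a placeholder.

// Sketch only: evaluate a trained model on a held-out test set.
Instances test = DataSource.read("test.arff");  // weka.core.converters.ConverterUtils.DataSource
test.setClassIndex(test.numAttributes() - 1);
double acc = getAccuracy(trainedClassifier, test);
System.out.println("Accuracy: " + acc);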
public int SelectRow_KLDivergenceMisclassified(
    Instances pool, Classifier myEstimator, int desiredAttr) {
  // For each instance with an unbought desiredAttr and label = desiredLabel,
  // measure the KL-divergence (relative entropy between two probability distributions):
  //   KL(P||Q) = sum_i p_i log(p_i / q_i)
  // With respect to Q = Uniform, this reduces to
  //   KL(P||U) = sum_i p_i log(p_i)
  // Choose the row that is minimum (i.e. closest to uniform).
  int numInstances = pool.numInstances();
  double[] KLDivs = new double[numInstances];
  boolean[] isValidInstance = new boolean[numInstances];
  boolean misclassified = false;
  double[] probs = null;
  Instance inst;

  for (int i = 0; i < numInstances; i++) {
    inst = pool.instance(i);
    try {
      if (inst.classValue() != myEstimator.classifyInstance(inst))
        misclassified = true;
      else
        misclassified = false;
    } catch (Exception e1) {
      e1.printStackTrace();
    }
    if (inst.isMissing(desiredAttr) && misclassified) {
      try {
        probs = myEstimator.distributionForInstance(inst);
      } catch (Exception e) {
        e.printStackTrace();
      }
      for (int j = 0; j < probs.length; j++)
        KLDivs[i] += MyXLogX(probs[j]);
      isValidInstance[i] = true;
    } else {
      KLDivs[i] = Double.MAX_VALUE;
      isValidInstance[i] = false;
    }
  }

  double leastDivergence = KLDivs[Utils.minIndex(KLDivs)];
  int numLeastDivs = 0;
  for (int i = 0; i < numInstances; i++)
    if (isValidInstance[i] && KLDivs[i] == leastDivergence)
      numLeastDivs++;
  int randomInstance = r.nextInt(numLeastDivs);
  int index = 0;
  for (int i = 0; i < numInstances; i++) {
    if (isValidInstance[i] && KLDivs[i] == leastDivergence) {
      if (index == randomInstance)
        return i;
      else
        index++;
    }
  }
  return -1;
}
private static boolean calcultateifhotspot(
    Classifier ibkclassification, Instance instclassification) {
  Double predictValue = null;
  try {
    predictValue = ibkclassification.classifyInstance(instclassification);
  } catch (Exception e) {
    e.printStackTrace();
  }
  return predictValue == 0.0;
}
public void classify(String filename) throws Exception {
  Instances unLabeledData = DataSource.read(filename);
  unLabeledData.setClassIndex(unLabeledData.numAttributes() - 1);
  Instances LabeledData = new Instances(unLabeledData);
  for (int i = 0; i < unLabeledData.numInstances(); ++i) {
    double clsLabel = classifier.classifyInstance(unLabeledData.instance(i));
    LabeledData.instance(i).setClassValue(clsLabel);
  }
  System.out.println(LabeledData.toString());
}
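If the labeled copy should be persisted rather than printed, a possible variation (a sketch only, assuming the same LabeledData variable and an output path of your choosing) writes it back out as ARFF:

// Sketch only: save the labeled copy instead of printing it.
ArffSaver saver = new ArffSaver();               // weka.core.converters.ArffSaver
saver.setInstances(LabeledData);
saver.setFile(new File("labeled-output.arff"));  // placeholder path
saver.writeBatch();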
@Override
public double classifyInstance(Instance instance) throws Exception {
  double sum = 0.0;
  for (Classifier classifier : classifiers) {
    double classification = classifier.classifyInstance(instance);
    sum += classification;
  }
  // vote over binary (0/1) predictions; note that classifiers.size() / 2 uses integer
  // division, so ties and some minority counts (e.g. 2 of 5 votes) still return 1.0
  if (sum >= classifiers.size() / 2) {
    return 1.0;
  }
  return 0.0;
}
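A minimal sketch of driving such a voting wrapper, assuming the enclosing class exposes the classifiers list used above; the base learners and the train set are placeholders.

// Sketch only: train a few 0/1 base classifiers that the voting wrapper will aggregate.
List<Classifier> classifiers = new ArrayList<Classifier>();
classifiers.add(new J48());
classifiers.add(new NaiveBayes());
classifiers.add(new Logistic());
for (Classifier c : classifiers) {
  c.buildClassifier(train);  // train: Instances with a binary (0/1) class attribute
}
// hand the list to the enclosing voting classifier, then call its classifyInstance(...)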
public String classifyInstance(Instance wekaInstance) {
  String label = null;
  try {
    double labelIndex = bayesNet.classifyInstance(wekaInstance);
    wekaInstance.setClassValue(labelIndex);
    label = wekaInstance.toString(wekaInstance.classIndex());
  } catch (Exception e) {
    System.err.println(e.getMessage());
    e.printStackTrace();
    System.exit(-1);
  }
  return label;
}
/**
 * Classifies a tweet.
 *
 * @param stringa the tweet to classify
 * @return the tweet polarity
 */
public double classifyDouble(String stringa) throws FileNotFoundException, IOException, Exception {
  String string_new;
  Preprocesser pre = new Preprocesser();
  string_new = pre.preprocessDocument(stringa);
  String tmp = "";
  StringTokenizer st = new StringTokenizer(string_new, " ");

  // Instances unlabeled = new Instances(new BufferedReader(new FileReader("D:/prova.arff")));
  Instances unlabeled = new Instances(_train, 1);
  Instance inst = new Instance(unlabeled.numAttributes());

  // load unlabeled data
  inst.setDataset(unlabeled);
  int j = 0;
  while (j < unlabeled.numAttributes()) {
    inst.setValue(j, "0");
    j++;
  }
  while (st.hasMoreTokens()) {
    tmp = st.nextToken();
    if (unlabeled.attribute(tmp) != null)
      inst.setValue(unlabeled.attribute(tmp), "1");
  }
  unlabeled.add(inst);

  // set class attribute
  unlabeled.setClassIndex(unlabeled.numAttributes() - 1);

  // create copy
  Instances labeled = new Instances(unlabeled);

  // label instances
  for (int i = 0; i < unlabeled.numInstances(); i++) {
    double clsLabel = _cl.classifyInstance(unlabeled.instance(i));
    labeled.instance(i).setClassValue(clsLabel);
  }
  // return labeled.instance(0).stringValue(unlabeled.numAttributes() - 1);
  // System.out.println("weight: " + labeled.instance(0).weight());
  return labeled.instance(0).classValue();
}
/**
 * Classifies a given instance using the selected classifier.
 *
 * @param instance the instance to be classified
 * @return the predicted class value
 * @exception Exception if instance could not be classified successfully
 */
public double classifyInstance(Instance instance) throws Exception {
  return m_Classifier.classifyInstance(instance);
}
public static void main(String[] args) throws Exception {
  // NaiveBayesSimple nb = new NaiveBayesSimple();
  // BufferedReader br_train = new BufferedReader(new FileReader("src/train.arff.txt"));
  // String s = null;
  // long st_time = System.currentTimeMillis();
  // Instances inst_train = new Instances(br_train);
  // System.out.println(inst_train.numAttributes());
  // inst_train.setClassIndex(inst_train.numAttributes() - 1);
  // System.out.println("train time" + (System.currentTimeMillis() - st_time));
  // NaiveBayes nb1 = new NaiveBayes();
  // nb1.buildClassifier(inst_train);
  // br_train.close();

  long st_time = System.currentTimeMillis();
  st_time = System.currentTimeMillis();

  Classifier classifier = (Classifier) SerializationHelper.read("NaiveBayes.model");

  // BufferedReader br_test = new BufferedReader(new FileReader("src/test.arff.txt"));
  // Instances inst_test = new Instances(br_test);
  // inst_test.setClassIndex(inst_test.numAttributes() - 1);
  // System.out.println("test time" + (System.currentTimeMillis() - st_time));

  ArffLoader testLoader = new ArffLoader();
  testLoader.setSource(new File("src/test.arff"));
  testLoader.setRetrieval(Loader.BATCH);
  Instances testDataSet = testLoader.getDataSet();

  Attribute testAttribute = testDataSet.attribute("class");
  testDataSet.setClass(testAttribute);

  int correct = 0;
  int incorrect = 0;

  FastVector attInfo = new FastVector();
  attInfo.addElement(new Attribute("Id"));
  attInfo.addElement(new Attribute("Category"));
  Instances outputInstances = new Instances("predict", attInfo, testDataSet.numInstances());

  Enumeration testInstances = testDataSet.enumerateInstances();
  int index = 1;
  while (testInstances.hasMoreElements()) {
    Instance instance = (Instance) testInstances.nextElement();
    double classification = classifier.classifyInstance(instance);
    // update the hit/miss counts only when the test set actually carries class labels;
    // otherwise the accuracy printed below is meaningless
    if (!instance.classIsMissing()) {
      if ((int) classification == (int) instance.classValue()) {
        correct++;
      } else {
        incorrect++;
      }
    }
    Instance predictInstance = new Instance(outputInstances.numAttributes());
    predictInstance.setValue(0, index++);
    predictInstance.setValue(1, (int) classification + 1);
    outputInstances.add(predictInstance);
  }

  System.out.println("Correct Instances: " + correct);
  System.out.println("Incorrect Instances: " + incorrect);
  double accuracy = (double) (correct) / (double) (correct + incorrect);
  System.out.println("Accuracy: " + accuracy);

  CSVSaver predictedCsvSaver = new CSVSaver();
  predictedCsvSaver.setFile(new File("predict.csv"));
  predictedCsvSaver.setInstances(outputInstances);
  predictedCsvSaver.writeBatch();

  System.out.println("Prediction saved to predict.csv");
}
// Given a question string, return the predicted question category.
public double classifyByBayes(String question) throws Exception {
  double label = -1;
  List<Question> questionID = questionDAO.getQuestionIDLabeled();

  // define the data format (attribute names kept exactly as in the original data)
  Attribute att1 = new Attribute("法律政策");
  Attribute att2 = new Attribute("位置交通");
  Attribute att3 = new Attribute("风水");
  Attribute att4 = new Attribute("房价");
  Attribute att5 = new Attribute("楼层");
  Attribute att6 = new Attribute("户型");
  Attribute att7 = new Attribute("小区配套");
  Attribute att8 = new Attribute("贷款");
  Attribute att9 = new Attribute("买房时机");
  Attribute att10 = new Attribute("开发商");
  FastVector labels = new FastVector();
  labels.addElement("1");
  labels.addElement("2");
  labels.addElement("3");
  labels.addElement("4");
  labels.addElement("5");
  labels.addElement("6");
  labels.addElement("7");
  labels.addElement("8");
  labels.addElement("9");
  labels.addElement("10");
  Attribute att11 = new Attribute("类别", labels);
  FastVector attributes = new FastVector();
  attributes.addElement(att1);
  attributes.addElement(att2);
  attributes.addElement(att3);
  attributes.addElement(att4);
  attributes.addElement(att5);
  attributes.addElement(att6);
  attributes.addElement(att7);
  attributes.addElement(att8);
  attributes.addElement(att9);
  attributes.addElement(att10);
  attributes.addElement(att11);
  Instances dataset = new Instances("Test-dataset", attributes, 0);
  dataset.setClassIndex(10);

  Classifier classifier = null;
  if (!new File("naivebayes.model").exists()) {
    // add the training data
    double[] values = new double[11];
    for (int i = 0; i < questionID.size(); i++) {
      for (int m = 0; m < 11; m++) {
        values[m] = 0;
      }
      int whitewordcount = 0;
      whitewordcount = questionDAO.getHitWhiteWordNum(questionID.get(i).getId());
      if (whitewordcount != 0) {
        List<QuestionWhiteWord> questionwhiteword =
            questionDAO.getHitQuestionWhiteWord(questionID.get(i).getId());
        for (int j = 0; j < questionwhiteword.size(); j++) {
          values[getAttIndex(questionwhiteword.get(j).getWordId()) - 1]++;
        }
        for (int m = 0; m < 11; m++) {
          values[m] = values[m] / whitewordcount;
        }
      }
      values[10] = questionID.get(i).getType() - 1;
      Instance inst = new Instance(1.0, values);
      dataset.add(inst);
    }
    // build the classifier
    classifier = new NaiveBayes();
    classifier.buildClassifier(dataset);
    SerializationHelper.write("naivebayes.model", classifier);
  } else {
    classifier = (Classifier) SerializationHelper.read("naivebayes.model");
  }

  System.out.println("*************begin evaluation*******************");
  Evaluation evaluation = new Evaluation(dataset);
  // strictly speaking this should evaluate on a separate data set, not the training data
  evaluation.evaluateModel(classifier, dataset);
  System.out.println(evaluation.toSummaryString());

  // classification
  System.out.println("*************begin classification*******************");
  Instance subject = new Instance(1.0, getQuestionVector(question));
  subject.setDataset(dataset);
  label = classifier.classifyInstance(subject);
  System.out.println("label: " + label);

  // double dis[] = classifier.distributionForInstance(inst);
  // for (double i : dis) {
  //   System.out.print(i + " ");
  // }

  System.out.println(questionID.size());
  return label + 1;
}
/**
 * Classifies a given instance.
 *
 * @param instance the instance to be classified
 * @return index of the predicted class
 * @throws Exception if an error occurred during the prediction
 */
public double classifyInstance(Instance instance) throws Exception {
  if (m_GroovyObject != null)
    return m_GroovyObject.classifyInstance(instance);
  else
    return Utils.missingValue();
}
/**
 * Cleanses the data based on misclassifications when using the training data.
 *
 * @param data the data to train with and cleanse
 * @return the cleansed data
 * @throws Exception if something goes wrong
 */
private Instances cleanseTrain(Instances data) throws Exception {
  Instance inst;
  Instances buildSet = new Instances(data);
  Instances temp = new Instances(data, data.numInstances());
  Instances inverseSet = new Instances(data, data.numInstances());
  int count = 0;
  double ans;
  int iterations = 0;
  int classIndex = m_classIndex;
  if (classIndex < 0) {
    classIndex = data.classIndex();
  }
  if (classIndex < 0) {
    classIndex = data.numAttributes() - 1;
  }

  // loop until perfect
  while (count != buildSet.numInstances()) {
    // check if hit maximum number of iterations
    iterations++;
    if (m_numOfCleansingIterations > 0 && iterations > m_numOfCleansingIterations) {
      break;
    }

    // build classifier
    count = buildSet.numInstances();
    buildSet.setClassIndex(classIndex);
    m_cleansingClassifier.buildClassifier(buildSet);

    temp = new Instances(buildSet, buildSet.numInstances());

    // test on training data
    for (int i = 0; i < buildSet.numInstances(); i++) {
      inst = buildSet.instance(i);
      ans = m_cleansingClassifier.classifyInstance(inst);
      if (buildSet.classAttribute().isNumeric()) {
        if (ans >= inst.classValue() - m_numericClassifyThreshold
            && ans <= inst.classValue() + m_numericClassifyThreshold) {
          temp.add(inst);
        } else if (m_invertMatching) {
          inverseSet.add(inst);
        }
      } else { // class is nominal
        if (ans == inst.classValue()) {
          temp.add(inst);
        } else if (m_invertMatching) {
          inverseSet.add(inst);
        }
      }
    }
    buildSet = temp;
  }

  if (m_invertMatching) {
    inverseSet.setClassIndex(data.classIndex());
    return inverseSet;
  } else {
    buildSet.setClassIndex(data.classIndex());
    return buildSet;
  }
}
/**
 * @param args the command line arguments
 * @throws Exception
 */
public static void main(String[] args) throws Exception {
  PreProcessor p = new PreProcessor("census-income.data", "census-income-preprocessed.arff");
  p.smote();
  PreProcessor p_test =
      new PreProcessor("census-income.test", "census-income-test-preprocessed.arff");
  p_test.run();

  BufferedReader traindata =
      new BufferedReader(new FileReader("census-income-preprocessed.arff"));
  BufferedReader testdata =
      new BufferedReader(new FileReader("census-income-test-preprocessed.arff"));
  Instances traininstance = new Instances(traindata);
  Instances testinstance = new Instances(testdata);
  traindata.close();
  testdata.close();
  traininstance.setClassIndex(traininstance.numAttributes() - 1);
  testinstance.setClassIndex(testinstance.numAttributes() - 1);
  int numOfAttributes = testinstance.numAttributes();
  int numOfInstances = testinstance.numInstances();

  NaiveBayesClassifier nb = new NaiveBayesClassifier("census-income-preprocessed.arff");
  Classifier cnaive = nb.NBClassify();

  DecisionTree dt = new DecisionTree("census-income-preprocessed.arff");
  Classifier cls = dt.DTClassify();

  AdaBoost ab = new AdaBoost("census-income-preprocessed.arff");
  AdaBoostM1 m1 = ab.AdaBoostDTClassify();

  BaggingMethod b = new BaggingMethod("census-income-preprocessed.arff");
  Bagging bag = b.BaggingDTClassify();

  SVM s = new SVM("census-income-preprocessed.arff");
  SMO svm = s.SMOClassifier();

  knn knnclass = new knn("census-income-preprocessed.arff");
  IBk knnc = knnclass.knnclassifier();

  Logistic log = new Logistic();
  log.buildClassifier(traininstance);

  int match = 0;
  int error = 0;
  int greater = 0;
  int less = 0;

  for (int i = 0; i < numOfInstances; i++) {
    String predicted = "";
    greater = 0;
    less = 0;
    double predictions[] = new double[8];

    double pred = cls.classifyInstance(testinstance.instance(i));
    predictions[0] = pred;

    double abpred = m1.classifyInstance(testinstance.instance(i));
    predictions[1] = abpred;

    double naivepred = cnaive.classifyInstance(testinstance.instance(i));
    predictions[2] = naivepred;

    double bagpred = bag.classifyInstance(testinstance.instance(i));
    predictions[3] = bagpred;

    double smopred = svm.classifyInstance(testinstance.instance(i));
    predictions[4] = smopred;

    double knnpred = knnc.classifyInstance(testinstance.instance(i));
    predictions[5] = knnpred;

    for (int j = 0; j < 6; j++) {
      if ((testinstance.instance(i).classAttribute().value((int) predictions[j]))
              .compareTo(">50K") == 0)
        greater++;
      else
        less++;
    }

    if (greater > less)
      predicted = ">50K";
    else
      predicted = "<=50K";

    if ((testinstance.instance(i).stringValue(numOfAttributes - 1)).compareTo(predicted) == 0)
      match++;
    else
      error++;
  }

  System.out.println("Correctly classified Instances: " + match);
  System.out.println("Misclassified Instances: " + error);
  double accuracy = (double) match / (double) numOfInstances * 100;
  double error_percent = 100 - accuracy;
  System.out.println("Accuracy: " + accuracy + "%");
  System.out.println("Error: " + error_percent + "%");
}
/**
 * Cleanses the data based on misclassifications when performing cross-validation.
 *
 * @param data the data to train with and cleanse
 * @return the cleansed data
 * @throws Exception if something goes wrong
 */
private Instances cleanseCross(Instances data) throws Exception {
  Instance inst;
  Instances crossSet = new Instances(data);
  Instances temp = new Instances(data, data.numInstances());
  Instances inverseSet = new Instances(data, data.numInstances());
  int count = 0;
  double ans;
  int iterations = 0;
  int classIndex = m_classIndex;
  if (classIndex < 0) {
    classIndex = data.classIndex();
  }
  if (classIndex < 0) {
    classIndex = data.numAttributes() - 1;
  }

  // loop until perfect
  while (count != crossSet.numInstances()
      && crossSet.numInstances() >= m_numOfCrossValidationFolds) {
    count = crossSet.numInstances();

    // check if hit maximum number of iterations
    iterations++;
    if (m_numOfCleansingIterations > 0 && iterations > m_numOfCleansingIterations) {
      break;
    }

    crossSet.setClassIndex(classIndex);
    if (crossSet.classAttribute().isNominal()) {
      crossSet.stratify(m_numOfCrossValidationFolds);
    }

    // do the folds
    temp = new Instances(crossSet, crossSet.numInstances());
    for (int fold = 0; fold < m_numOfCrossValidationFolds; fold++) {
      Instances train = crossSet.trainCV(m_numOfCrossValidationFolds, fold);
      m_cleansingClassifier.buildClassifier(train);
      Instances test = crossSet.testCV(m_numOfCrossValidationFolds, fold);

      // now test
      for (int i = 0; i < test.numInstances(); i++) {
        inst = test.instance(i);
        ans = m_cleansingClassifier.classifyInstance(inst);
        if (crossSet.classAttribute().isNumeric()) {
          if (ans >= inst.classValue() - m_numericClassifyThreshold
              && ans <= inst.classValue() + m_numericClassifyThreshold) {
            temp.add(inst);
          } else if (m_invertMatching) {
            inverseSet.add(inst);
          }
        } else { // class is nominal
          if (ans == inst.classValue()) {
            temp.add(inst);
          } else if (m_invertMatching) {
            inverseSet.add(inst);
          }
        }
      }
    }
    crossSet = temp;
  }

  if (m_invertMatching) {
    inverseSet.setClassIndex(data.classIndex());
    return inverseSet;
  } else {
    crossSet.setClassIndex(data.classIndex());
    return crossSet;
  }
}
public HashMap<String, String> process(
    Sentence sent, String dep, HashSet<String> terms, List<NamedEntity> entities,
    String author, String aidx) {
  try {
    // System.out.println("ML start!");
    // System.out.println("List : " + terms);
    HashMap<String, String> ht = new HashMap<String, String>();

    // drop entities that contain another extracted entity
    List<NamedEntity> newEntities = new ArrayList<NamedEntity>();
    for (NamedEntity entity : entities) {
      // System.out.println("original: " + entity.entity);
      boolean check = false;
      for (NamedEntity temp : entities) {
        if (entity == temp)
          continue;
        if (entity.entity.contains(temp.entity)) {
          check = true;
        }
      }
      if (!check)
        newEntities.add(entity);
    }

    List<DependencyTriple> dtl = getDependencyTripleList(dep);
    List<NamedEntity> targetCands = new ArrayList<NamedEntity>();
    HashMap<NamedEntity, String> tOpinTerm = new HashMap<NamedEntity, String>();
    List<NamedEntity> holderCands = new ArrayList<NamedEntity>();
    HashMap<NamedEntity, String> hOpinTerm = new HashMap<NamedEntity, String>();

    // write target features to a temporary CSV for Weka
    BufferedWriter writer = new BufferedWriter(new FileWriter("weka_target.csv"));
    writer.write("A1,A2,A3,A4,A5,A6,A7,A8,A9,A10,A11,A12,A13,Class\n");
    boolean check = false;
    List<NamedEntity> targetTmp = new ArrayList<NamedEntity>();
    for (NamedEntity entity : newEntities) {
      // System.out.println("extracted: " + entity.entity);
      String temp = getTargetFeatures(entity, author, terms, dtl);
      // System.out.println(temp);
      if (temp.length() > 1) {
        check = true;
        writer.write(temp);
        String[] toks = temp.split("\n");
        for (int i = 0; i < toks.length; i++) {
          targetTmp.add(entity);
          tOpinTerm.put(entity, toks[i].substring(0, toks[i].indexOf(",")));
        }
      }
    }
    writer.close();

    if (check) {
      DataSource source = new DataSource("weka_target.csv");
      Instances testdata = source.getDataSet();
      testdata.setClassIndex(testdata.numAttributes() - 1);
      Classifier models =
          (Classifier) weka.core.SerializationHelper.read("target_smoreg.model");
      if (testdata.numInstances() != targetTmp.size())
        System.out.println("wrong number of instances");
      for (int i = 0; i < testdata.numInstances(); i++) {
        double pred = models.classifyInstance(testdata.instance(i));
        if (pred >= 1.0) {
          // System.out.println(pred + " , " + targetTmp.get(i).entity);
          targetCands.add(targetTmp.get(i));
        }
      }
    }

    // write holder features to a temporary CSV for Weka
    writer = new BufferedWriter(new FileWriter("weka_holder.csv"));
    writer.write("A1,A2,A3,A4,A5,A6,A7,A8,A9,A10,A11,A12,A13,A14,A15,Class\n");
    check = false;
    List<NamedEntity> holderTmp = new ArrayList<NamedEntity>();
    for (NamedEntity entity : newEntities) {
      // System.out.println("extracted: " + entity.entity);
      String temp = getHolderFeatures(entity, author, terms, dtl);
      // System.out.println(temp);
      if (temp.length() > 1) {
        check = true;
        writer.write(temp);
        String[] toks = temp.split("\n");
        for (int i = 0; i < toks.length; i++) {
          holderTmp.add(entity);
          hOpinTerm.put(entity, toks[i].substring(0, toks[i].indexOf(",")));
        }
      }
    }
    writer.close();

    if (check) {
      DataSource source = new DataSource("weka_holder.csv");
      Instances testdata = source.getDataSet();
      testdata.setClassIndex(testdata.numAttributes() - 1);
      Classifier models =
          (Classifier) weka.core.SerializationHelper.read("holder_smoreg.model");
      if (testdata.numInstances() != holderTmp.size())
        System.out.println("wrong number of instances");
      for (int i = 0; i < testdata.numInstances(); i++) {
        double pred = models.classifyInstance(testdata.instance(i));
        if (pred >= 1.0) {
          // System.out.println(pred + " , " + holderTmp.get(i).entity);
          holderCands.add(holderTmp.get(i));
        }
      }
    }

    if ((targetCands.size() == 0) || (holderCands.size() == 0))
      return ht;

    // keep holder candidates that are not substrings of a longer candidate
    List<NamedEntity> holderCandTmp = new ArrayList<NamedEntity>();
    for (NamedEntity holderCand : holderCands) {
      boolean hasLonger = false;
      for (NamedEntity temp : holderCands) {
        if (temp.entity.compareTo(holderCand.entity) == 0)
          continue;
        if (temp.entity.contains(holderCand.entity)) {
          hasLonger = true;
          break;
        }
      }
      if (!hasLonger)
        holderCandTmp.add(holderCand);
    }

    // keep target candidates that are not substrings of a longer candidate
    List<NamedEntity> targetCandTmp = new ArrayList<NamedEntity>();
    for (NamedEntity targetCand : targetCands) {
      boolean hasLonger = false;
      for (NamedEntity temp : targetCands) {
        if (temp.entity.compareTo(targetCand.entity) == 0)
          continue;
        if (temp.entity.contains(targetCand.entity)) {
          hasLonger = true;
          break;
        }
      }
      if (!hasLonger)
        targetCandTmp.add(targetCand);
    }

    // pair up holder and target candidates that share the same opinion term
    for (NamedEntity targetCand : targetCandTmp) {
      if (targetCand.entity.compareTo(author) == 0)
        continue;
      for (NamedEntity holderCand : holderCandTmp) {
        if (targetCand.entity.compareTo(holderCand.entity) == 0)
          continue;
        String targetOpin = tOpinTerm.get(targetCand);
        String holderOpin = hOpinTerm.get(holderCand);
        // System.out.println(targetOpin + ", " + holderOpin);
        if (targetOpin.compareTo(holderOpin) != 0)
          continue;
        String opin = targetOpin
            .concat("\t")
            .concat(Integer.toString(sent.sent.indexOf(targetOpin) + sent.beg)
                .concat("-")
                .concat(Integer.toString(
                    sent.sent.indexOf(targetOpin) + sent.beg + targetOpin.length())));
        String holder = holderCand.entity
            .concat("\t")
            .concat(Integer.toString(holderCand.beg)
                .concat("-")
                .concat(Integer.toString(holderCand.end)));
        String target = targetCand.entity
            .concat("\t")
            .concat(Integer.toString(targetCand.beg)
                .concat("-")
                .concat(Integer.toString(targetCand.end)));
        ht.put(targetOpin, opin.concat("\t").concat(holder).concat("\t").concat(target));
      }
    }
    return ht;
  } catch (IOException e) {
    e.printStackTrace();
    return null;
  } catch (Exception e) {
    e.printStackTrace();
    return null;
  }
}