Example #1
  // Construct a tri-trainer classifier.
  public Tritrainer(
      String classifier, String trainingIns_File, String testIns_File, double percentage) {
    try {
      this.classifier1 = (Classifier) Class.forName(classifier).newInstance();
      this.classifier2 = (Classifier) Class.forName(classifier).newInstance();
      this.classifier3 = (Classifier) Class.forName(classifier).newInstance();

      Instances trainingInstances = Util.getInstances(trainingIns_File);

      // Split trainingIns_File into labeledIns and unlabeledIns in the ratio
      // percentage : (1 - percentage).
      int length = trainingInstances.numInstances();
      int i = (int) (length * percentage);
      labeledIns = new Instances(trainingInstances, 0);
      for (int j = 0; j < i; j++) {
        labeledIns.add(trainingInstances.firstInstance());
        trainingInstances.delete(0);
      }
      unlabeledIns = trainingInstances;
      testIns = Util.getInstances(testIns_File);

      Init();
    } catch (Exception e) {
      e.printStackTrace(); // don't silently swallow construction failures
    }
  }
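A minimal, hypothetical usage sketch (the base classifier name, file paths, and split ratio below are placeholders, not part of the original source):

  // Hypothetical call: J48 as the base learner, 20% of the training file labeled.
  Tritrainer trainer =
      new Tritrainer("weka.classifiers.trees.J48", "train.arff", "test.arff", 0.2);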
Example #2
 // Trim the given instance set down to a subsample of m instances.
 public void SubSample(Instances inst, int m) {
   inst.randomize(new Random());
   while (inst.numInstances() > m) { // '>' instead of '!=' avoids an infinite loop when m > numInstances
     inst.delete(0);
   }
   // System.out.println("subsample:=" + inst.numInstances() + " m:=" + m );
 }
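For a reproducible subsample, Weka's Resample filter with a fixed seed is an alternative to deleting in place; a sketch, assuming the unsupervised filter is acceptable (the seed value is arbitrary):

  import weka.core.Instances;
  import weka.filters.Filter;
  import weka.filters.unsupervised.instance.Resample;

  // Sketch: draw roughly m instances without replacement, leaving 'inst' untouched.
  static Instances subSample(Instances inst, int m) throws Exception {
    Resample resample = new Resample();
    resample.setRandomSeed(42); // fixed seed for reproducibility
    resample.setNoReplacement(true);
    resample.setSampleSizePercent(100.0 * m / inst.numInstances());
    resample.setInputFormat(inst);
    return Filter.useFilter(inst, resample);
  }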
Example #3
  /**
   * This will remove all buffered instances from the inputformat dataset. Use this method rather
   * than getInputFormat().delete().
   */
  protected void flushInput() {

    if ((m_InputStringAtts.getAttributeIndices().length > 0)
        || (m_InputRelAtts.getAttributeIndices().length > 0)) {
      m_InputFormat = m_InputFormat.stringFreeStructure();
      m_InputStringAtts = new StringLocator(m_InputFormat, m_InputStringAtts.getAllowedIndices());
      m_InputRelAtts = new RelationalLocator(m_InputFormat, m_InputRelAtts.getAllowedIndices());
    } else {
      // This is more efficient than new Instances(m_InputFormat, 0);
      m_InputFormat.delete();
    }
  }
Example #4
  /**
   * Process the input and compute the statistics of the training data
   *
   * @return an Input object which holds the statistics about the training data
   */
  private Input sIB_ProcessInput() {
    double valSum = 0.0;
    for (int i = 0; i < m_numInstances; i++) {
      valSum = 0.0;
      for (int v = 0; v < m_data.instance(i).numValues(); v++) {
        valSum += m_data.instance(i).valueSparse(v);
      }
      if (valSum <= 0) {
        if (m_verbose) {
          System.out.format("Instance %s sum of value = %s <= 0, removed.\n", i, valSum);
        }
        m_data.delete(i);
        m_numInstances--;
        i--; // re-check this index: the next instance has shifted into position i
      }
    }

    // get the term-document matrix
    Input input = new Input();
    input.Py_x = getTransposedNormedMatrix(m_data);
    if (m_uniformPrior) {
      input.Pyx = input.Py_x.copy();
      normalizePrior(m_data);
    } else {
      input.Pyx = getTransposedMatrix(m_data);
    }
    input.sumVals = getTotalSum(m_data);
    input.Pyx.timesEquals(1 / input.sumVals);

    // prior probability of documents, i.e. sum the columns of the Pyx matrix
    input.Px = new double[m_numInstances];
    for (int i = 0; i < m_numInstances; i++) {
      for (int j = 0; j < m_numAttributes; j++) {
        input.Px[i] += input.Pyx.get(j, i);
      }
    }

    // prior probability of terms, i.e. sum the rows of the Pyx matrix
    input.Py = new double[m_numAttributes];
    for (int i = 0; i < input.Pyx.getRowDimension(); i++) {
      for (int j = 0; j < input.Pyx.getColumnDimension(); j++) {
        input.Py[i] += input.Pyx.get(i, j);
      }
    }

    MI(input.Pyx, input);
    return input;
  }
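For reference, the two prior vectors above are just row and column sums of the joint matrix; a standalone sketch with weka.core.matrix.Matrix (the 2x2 values are illustrative only):

  import weka.core.matrix.Matrix;

  // Sketch: row sums give the term priors Py, column sums the document priors Px,
  // assuming Pyx is already normalized so that all entries sum to 1.
  Matrix Pyx = new Matrix(new double[][] {{0.1, 0.2}, {0.3, 0.4}});
  double[] Py = new double[Pyx.getRowDimension()];
  double[] Px = new double[Pyx.getColumnDimension()];
  for (int i = 0; i < Pyx.getRowDimension(); i++) {
    for (int j = 0; j < Pyx.getColumnDimension(); j++) {
      Py[i] += Pyx.get(i, j); // row sum: accumulate across columns
      Px[j] += Pyx.get(i, j); // column sum: accumulate across rows
    }
  }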
Example #5
  /**
   * Reads the data set incrementally---gets the next instance in the data set, or returns null
   * if there are no more instances to get. If the structure hasn't yet been determined by a call
   * to getStructure, this method should do so before returning the next instance in the data set.
   *
   * @return the next instance in the data set as an Instance object or null if there are no more
   *     instances to be read
   * @exception IOException if there is an error during parsing
   */
  public Instance getNextInstance() throws IOException {

    if (m_structure == null) {
      getStructure();
    }
    if (getRetrieval() == BATCH) {
      throw new IOException("Cannot mix getting Instances in both incremental and batch modes");
    }
    setRetrieval(INCREMENTAL);

    if (!m_structure.readInstance(m_sourceReader)) {
      return null;
    }

    Instance current = m_structure.instance(0);
    m_structure.delete(0);
    return current;
  }
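A typical caller loop, sketched with Weka's ArffLoader (the file name is a placeholder; recent Weka versions pass the structure into getNextInstance):

  import java.io.File;
  import weka.core.Instance;
  import weka.core.Instances;
  import weka.core.converters.ArffLoader;

  // Sketch: read an ARFF file one instance at a time instead of loading it whole.
  ArffLoader loader = new ArffLoader();
  loader.setFile(new File("data.arff")); // placeholder path
  Instances structure = loader.getStructure();
  Instance current;
  while ((current = loader.getNextInstance(structure)) != null) {
    // process 'current' here
  }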
Example #6
  /**
   * Adds the supplied instance to the training set.
   *
   * @param instance the instance to add
   * @throws Exception if instance could not be incorporated successfully
   */
  public void updateClassifier(Instance instance) throws Exception {

    if (!m_Train.equalHeaders(instance.dataset())) {
      throw new Exception("Incompatible instance types");
    }
    if (instance.classIsMissing()) {
      return;
    }

    m_Train.add(instance);
    m_NNSearch.update(instance);
    m_kNNValid = false;
    if ((m_WindowSize > 0) && (m_Train.numInstances() > m_WindowSize)) {
      boolean deletedInstance = false;
      while (m_Train.numInstances() > m_WindowSize) {
        m_Train.delete(0);
        deletedInstance = true;
      }
      // rebuild the search structure; KDTree currently can't delete
      if (deletedInstance) m_NNSearch.setInstances(m_Train);
    }
  }
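A sketch of the corresponding training loop with an updateable learner such as weka.classifiers.lazy.IBk ('data' is assumed to be an already loaded Instances object):

  import weka.classifiers.lazy.IBk;
  import weka.core.Instances;

  // Sketch: seed the classifier with the header only, then feed instances one by one.
  IBk knn = new IBk();
  Instances header = new Instances(data, 0);
  header.setClassIndex(header.numAttributes() - 1);
  knn.buildClassifier(header); // IBk accepts an empty training set
  for (int i = 0; i < data.numInstances(); i++) {
    knn.updateClassifier(data.instance(i));
  }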
Example #7
  /**
   * Process an instance. When the batch size is reached, a batch of predictions is returned;
   * returns null if the batch size has not been reached yet.
   *
   * @param inst the instance to process
   * @return a batch of predictions or null if we have not seen enough input instances to meet the
   *     batch size yet
   * @throws DistributedWekaException if a problem occurs
   */
  public double[][] processInstanceBatchPredictor(Instance inst) throws DistributedWekaException {

    inst = mapIncomingFieldsToModelFields(inst);
    m_batchScoringData.add(inst);

    if (m_batchScoringData.numInstances() == m_batchSize) {
      try {
        double[][] predictions = m_model.distributionsForInstances(m_batchScoringData);
        if (predictions.length != m_batchScoringData.numInstances()) {
          throw new Exception(
              "Number of predictions did not match the number " + "of instances in the batch");
        }

        m_batchScoringData.delete();

        return predictions;
      } catch (Exception ex) {
        throw new DistributedWekaException(ex);
      }
    }

    return null;
  }
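A hypothetical feeding loop (the 'scorer' and 'data' objects are placeholders for however the surrounding job wires this up):

  // Push instances in one at a time; act only when a full batch of predictions returns.
  for (int i = 0; i < data.numInstances(); i++) {
    double[][] batch = scorer.processInstanceBatchPredictor(data.instance(i));
    if (batch != null) {
      // a full batch of class distributions is available; consume it here
    }
  }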
Example #8
  public static void execSVM(String expName) {
    try {
      FileWriter outFile = null;
      PrintWriter out = null;
      outFile = new FileWriter(expName + "-SVM.results");
      out = new PrintWriter(outFile);
      DateFormat dateFormat = new SimpleDateFormat("yyyy/MM/dd HH:mm:ss");
      ProcessTweets tweetsProcessor = null;
      System.out.println("***************************************");
      System.out.println("***\tEXECUTING TEST\t" + expName + "***");
      System.out.println("+++++++++++++++++++++++++++++++++++++++");
      out.println("***************************************");
      out.println("***\tEXECUTING TEST\t" + expName + "***");
      out.println("+++++++++++++++++++++++++++++++++++++++");
      out.println("4-Generate classifier " + dateFormat.format(new Date()));

      Classifier cls = null;
      DataSource sourceTrain = new DataSource(expName + "-train.arff");
      Instances dataTrain = sourceTrain.getDataSet();
      if (dataTrain.classIndex() == -1) dataTrain.setClassIndex(dataTrain.numAttributes() - 1);
      // Train the classifier. Note: despite the method's name, the LibSVM line is
      // commented out and ComplementNaiveBayes is what actually gets trained.
      // cls = new weka.classifiers.functions.LibSVM();
      int clase = dataTrain.numAttributes() - 1;
      cls = new weka.classifiers.bayes.ComplementNaiveBayes();
      dataTrain.setClassIndex(clase);
      cls.buildClassifier(dataTrain);
      ObjectOutputStream oos =
          new ObjectOutputStream(new FileOutputStream(expName + "-SVM.classifier"));
      oos.writeObject(cls);
      oos.flush();
      oos.close();
      DataSource sourceTest = new DataSource(expName + "-test.arff");
      Instances dataTest = sourceTest.getDataSet();
      dataTest.setClassIndex(clase);
      Evaluation eval = new Evaluation(dataTest);
      eval.evaluateModel(cls, dataTest);
      // Now compute the precision, recall and F-measure values, and print the
      // confusion matrices.

      float precision = 0;
      float recall = 0;
      float fmeasure = 0;
      int topeTopics = 8;
      for (int ind = 0; ind < topeTopics; ind++) {
        precision += eval.precision(ind);
        recall += eval.recall(ind);
        fmeasure += eval.fMeasure(ind);
      }
      precision = precision / topeTopics;
      recall = recall / topeTopics;
      fmeasure = fmeasure / topeTopics;
      System.out.println("++++++++++++++ CNB ++++++++++++++++++++");
      System.out.println(eval.toMatrixString());
      System.out.println("+++++++++++++++++++++++++++++++++++++++");
      System.out.printf("Precision: %.3f\n", precision);
      System.out.printf("Recall: %.3f\n", recall);
      System.out.printf("F-measure: %.3f\n", fmeasure);
      System.out.println("***************************************");
      out.println("++++++++++++++ CNB ++++++++++++++++++++");
      out.println(eval.toMatrixString());
      out.println("+++++++++++++++++++++++++++++++++++++++");
      out.printf("Precision: %.3f\n", precision);
      out.printf("Recall: %.3f\n", recall);
      out.printf("F-measure: %.3f\n", fmeasure);
      out.println("***************************************");
      // ANOTHER CLASSIFIER: ZeroR
      cls = new weka.classifiers.rules.ZeroR();
      dataTrain.setClassIndex(clase);
      cls.buildClassifier(dataTrain);
      eval = new Evaluation(dataTest);
      eval.evaluateModel(cls, dataTest);
      precision = 0;
      recall = 0;
      fmeasure = 0;
      for (int ind = 0; ind < topeTopics; ind++) {
        precision += eval.precision(ind);
        recall += eval.recall(ind);
        fmeasure += eval.fMeasure(ind);
      }
      precision = precision / topeTopics;
      recall = recall / topeTopics;
      fmeasure = fmeasure / topeTopics;
      System.out.println("++++++++++++++ ZEROR ++++++++++++++++++++");
      System.out.println(eval.toMatrixString());
      System.out.println("+++++++++++++++++++++++++++++++++++++++");
      System.out.printf("Precision: %.3f\n", precision);
      System.out.printf("Recall: %.3f\n", recall);
      System.out.printf("F-measure: %.3f\n", fmeasure);
      System.out.println("***************************************");
      out.println("++++++++++++++ ZEROR ++++++++++++++++++++");
      out.println(eval.toMatrixString());
      out.println("+++++++++++++++++++++++++++++++++++++++");
      out.printf("Precision: %.3f\n", precision);
      out.printf("Recall: %.3f\n", recall);
      out.printf("F-measure: %.3f\n", fmeasure);
      out.println("***************************************");
      // ANOTHER CLASSIFIER: J48 (kept commented out below)
      /*
      			cls = new weka.classifiers.trees.J48();
      			dataTrain.setClassIndex(clase);
      			cls.buildClassifier(dataTrain);
      			eval = new Evaluation(dataTest);
      			eval.evaluateModel(cls, dataTest);
      			precision=0;
      			recall=0;
      			fmeasure=0;
      			for(int ind=0; ind<topeTopics; ind++)
      			{
      				precision += eval.precision(ind);
      				recall += eval.recall(ind);
      				fmeasure += eval.fMeasure(ind);
      			}
      			precision = precision / topeTopics;
      			recall = recall / topeTopics;
      			fmeasure = fmeasure / topeTopics;
      			System.out.println("++++++++++++++ J48 ++++++++++++++++++++");
      			System.out.println(eval.toMatrixString());
      			System.out.println("+++++++++++++++++++++++++++++++++++++++");
      			System.out.printf("Precision: %.3f\n", precision);
      			System.out.printf("Recall: %.3f\n", recall);
      			System.out.printf("F-measure: %.3f\n", fmeasure);
      			System.out.println("***************************************");
      			out.println("++++++++++++++ J48 ++++++++++++++++++++");
      			out.println(eval.toMatrixString());
      			out.println("+++++++++++++++++++++++++++++++++++++++");
      			out.printf("Precision: %.3f\n", precision);
      			out.printf("Recall: %.3f\n", recall);
      			out.printf("F-measure: %.3f\n", fmeasure);
      			out.println("***************************************");

      //ANOTHER CLASSIFIER: SMO
      			cls = new weka.classifiers.functions.SMO();
      			dataTrain.setClassIndex(clase);
      			cls.buildClassifier(dataTrain);
      			eval = new Evaluation(dataTest);
      			eval.evaluateModel(cls, dataTest);
      			precision=0;
      			recall=0;
      			fmeasure=0;
      			for(int ind=0; ind<topeTopics; ind++)
      			{
      				precision += eval.precision(ind);
      				recall += eval.recall(ind);
      				fmeasure += eval.fMeasure(ind);
      			}
      			precision = precision / topeTopics;
      			recall = recall / topeTopics;
      			fmeasure = fmeasure / topeTopics;
      			System.out.println("++++++++++++++ SMO ++++++++++++++++++++");
      			System.out.println(eval.toMatrixString());
      			System.out.println("+++++++++++++++++++++++++++++++++++++++");
      			System.out.printf("Precision: %.3f\n", precision);
      			System.out.printf("Recall: %.3f\n", recall);
      			System.out.printf("F-measure: %.3f\n", fmeasure);
      			System.out.println("***************************************");
      			out.println("++++++++++++++ SMO ++++++++++++++++++++");
      			out.println(eval.toMatrixString());
      			out.println("+++++++++++++++++++++++++++++++++++++++");
      			out.printf("Precision: %.3f\n", precision);
      			out.printf("Recall: %.3f\n", recall);
      			out.printf("F-measure: %.3f\n", fmeasure);
      			out.println("***************************************");
      */
      out.flush();
      out.close();
      dataTest.delete();
      dataTrain.delete();
    } catch (Exception e) {
      // Exception already covers FileNotFoundException and IOException
      e.printStackTrace();
    }
  }
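The macro-averaging block above is duplicated for each classifier; a sketch of a shared helper (the name printMacroAverages is made up for illustration):

  import java.io.PrintWriter;
  import weka.classifiers.Evaluation;

  // Sketch: macro-average precision/recall/F-measure over the first numClasses classes.
  static void printMacroAverages(Evaluation eval, int numClasses, PrintWriter out)
      throws Exception {
    double precision = 0, recall = 0, fmeasure = 0;
    for (int ind = 0; ind < numClasses; ind++) {
      precision += eval.precision(ind);
      recall += eval.recall(ind);
      fmeasure += eval.fMeasure(ind);
    }
    out.println(eval.toMatrixString());
    out.printf("Precision: %.3f%n", precision / numClasses);
    out.printf("Recall: %.3f%n", recall / numClasses);
    out.printf("F-measure: %.3f%n", fmeasure / numClasses);
  }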
Example #9
  public void exec(PrintWriter printer) {
    try {
      FileWriter outFile = null;
      PrintWriter out = null;
      if (printer == null) {
        outFile = new FileWriter(id + ".results");
        out = new PrintWriter(outFile);
      } else out = printer;

      DateFormat dateFormat = new SimpleDateFormat("yyyy/MM/dd HH:mm:ss");
      ProcessTweets tweetsProcessor = null;
      System.out.println("***************************************");
      System.out.println("***\tEXECUTING TEST\t" + id + "***");
      System.out.println("+++++++++++++++++++++++++++++++++++++++");
      System.out.println("Train size:" + traincorpus.size());
      System.out.println("Test size:" + testcorpus.size());
      out.println("***************************************");
      out.println("***\tEXECUTING TEST\t***");
      out.println("+++++++++++++++++++++++++++++++++++++++");
      out.println("Train size:" + traincorpus.size());
      out.println("Test size:" + testcorpus.size());
      String cloneID = "";
      boolean clonar = false;
      if (baseline) {
        System.out.println("***************************************");
        System.out.println("***\tEXECUTING TEST BASELINE\t***");
        System.out.println("+++++++++++++++++++++++++++++++++++++++");
        System.out.println("Train size:" + traincorpus.size());
        System.out.println("Test size:" + testcorpus.size());
        out.println("***************************************");
        out.println("***\tEXECUTING TEST\t***");
        out.println("+++++++++++++++++++++++++++++++++++++++");
        out.println("Train size:" + traincorpus.size());
        out.println("Test size:" + testcorpus.size());

        BaselineClassifier base = new BaselineClassifier(testcorpus, 8);
        precision = base.getPrecision();
        recall = base.getRecall();
        fmeasure = base.getFmeasure();
        System.out.println("+++++++++++++++++++++++++++++++++++++++");
        System.out.printf("Precision: %.3f\n", precision);
        System.out.printf("Recall: %.3f\n", recall);
        System.out.printf("F-measure: %.3f\n", fmeasure);
        System.out.println("***************************************");
        out.println("+++++++++++++++++++++++++++++++++++++++");
        out.printf("Precision: %.3f\n", precision);
        out.printf("Recall: %.3f\n", recall);
        out.printf("F-measure: %.3f\n", fmeasure);
        out.println("***************************************");
        out.flush();
        out.close();
        return;
      } else {
        System.out.println("Stemming: " + stemming);
        System.out.println("Lematization:" + lematization);
        System.out.println("URLs:" + urls);
        System.out.println("Hashtags:" + hashtags);
        System.out.println("Mentions:" + mentions);
        System.out.println("Unigrams:" + unigrams);
        System.out.println("Bigrams:" + bigrams);
        System.out.println("TF:" + tf);
        System.out.println("TF-IDF:" + tfidf);
        out.println("Stemming: " + stemming);
        out.println("Lematization:" + lematization);
        out.println("URLs:" + urls);
        out.println("Hashtags:" + hashtags);
        out.println("Mentions:" + mentions);
        out.println("Unigrams:" + unigrams);
        out.println("Bigrams:" + bigrams);
        out.println("TF:" + tf);
        out.println("TF-IDF:" + tfidf);
      }
      // If the tweets are already processed, avoid reprocessing them.
      System.out.println("1-Process tweets " + dateFormat.format(new Date()));
      out.println("1-Process tweets " + dateFormat.format(new Date()));

      List<ProcessedTweet> train = null;
      String[] ids = id.split("-");
      cloneID = ids[0] + "-" + (Integer.valueOf(ids[1]) + 6);
      if (((Integer.valueOf(ids[1]) / 6) % 2) == 0) clonar = true;

      if (new File(id + "-train.ptweets").exists()) {
        train = ProcessedTweetSerialization.fromFile(id + "-train.ptweets");
        tweetsProcessor =
            new ProcessTweets(stemming, lematization, urls, hashtags, mentions, unigrams, bigrams);
        if (lematization) {
          tweetsProcessor.doLematization(train);
        }
        if (stemming) {
          tweetsProcessor.doStemming(train);
        }
      } else {
        tweetsProcessor =
            new ProcessTweets(stemming, lematization, urls, hashtags, mentions, unigrams, bigrams);
        // The setTraining call was added to be able to tell apart the languages of the
        // URLs in the parallel corpus.
        //				tweetsProcessor.setTraining(true);
        train = tweetsProcessor.processTweets(traincorpus);
        //				tweetsProcessor.setTraining(false);
        ProcessedTweetSerialization.toFile(id + "-train.ptweets", train);
        /*
        				if (clonar)
        				{
        					File f = new File (id+"-train.ptweets");
        					Path p = f.toPath();
        					CopyOption[] options = new CopyOption[]{
        						      StandardCopyOption.REPLACE_EXISTING,
        						      StandardCopyOption.COPY_ATTRIBUTES
        						     };
        					Files.copy(p, new File (cloneID+"-train.ptweets").toPath(), options);
        					Files.copy(p, new File (ids[0]+"-"+(Integer.valueOf(ids[1])+12)+"-train.ptweets").toPath(), options);
        					Files.copy(p, new File (ids[0]+"-"+(Integer.valueOf(ids[1])+18)+"-train.ptweets").toPath(), options);
        					Files.copy(p, new File (ids[0]+"-"+(Integer.valueOf(ids[1])+24)+"-train.ptweets").toPath(), options);
        					Files.copy(p, new File (ids[0]+"-"+(Integer.valueOf(ids[1])+30)+"-train.ptweets").toPath(), options);
        				}
        */
      }

      // Generate the bags of words. As before, do not recreate them if they already exist.
      System.out.println("2-Fill topics " + dateFormat.format(new Date()));
      out.println("2-Fill topics " + dateFormat.format(new Date()));
      TopicsList topics = null;
      if (new File(id + ".topics").exists()) {
        topics = TopicsSerialization.fromFile(id + ".topics");
        if (tf) topics.setSelectionFeature(TopicDesc.TERM_TF);
        else topics.setSelectionFeature(TopicDesc.TERM_TF_IDF);
        topics.prepareTopics();
      } else {

        topics = new TopicsList();
        if (tf) topics.setSelectionFeature(TopicDesc.TERM_TF);
        else topics.setSelectionFeature(TopicDesc.TERM_TF_IDF);
        System.out.println("Filling topics " + dateFormat.format(new Date()));
        topics.fillTopics(train);
        System.out.println("Preparing topics topics " + dateFormat.format(new Date()));
        // Aquí tengo que serializar antes de preparar, porque si no no puedo calcular los tf y
        // tfidf
        System.out.println("Serializing topics topics " + dateFormat.format(new Date()));
        /*
        				if (clonar)
        				{
        					TopicsSerialization.toFile(cloneID+".topics", topics);
        				}
        */
        topics.prepareTopics();
        TopicsSerialization.toFile(id + ".topics", topics);
      }
      System.out.println("3-Generate arff train file " + dateFormat.format(new Date()));
      out.println("3-Generate arff train file " + dateFormat.format(new Date()));

      // If the ARFF file does not exist, create it; otherwise keep doing what we have
      // done so far and reuse the previous work.
      if (!new File(id + "-train.arff").exists()) {

        BufferedWriter bw = topics.generateArffHeader(id + "-train.arff");
        int tope = traincorpus.size();
        if (tweetsProcessor == null)
          tweetsProcessor =
              new ProcessTweets(
                  stemming, lematization, urls, hashtags, mentions, unigrams, bigrams);
        for (int indTweet = 0; indTweet < tope; indTweet++) {
          topics.generateArffVector(bw, train.get(indTweet));
        }
        bw.flush();
        bw.close();
      }

      // Now process the test data
      System.out.println("5-build test dataset " + dateFormat.format(new Date()));
      out.println("5-build test dataset " + dateFormat.format(new Date()));

      List<ProcessedTweet> test = null;
      if (new File(id + "-test.ptweets").exists())
        test = ProcessedTweetSerialization.fromFile(id + "-test.ptweets");
      else {
        if (tweetsProcessor == null)
          tweetsProcessor =
              new ProcessTweets(
                  stemming, lematization, urls, hashtags, mentions, unigrams, bigrams);
        test = tweetsProcessor.processTweets(testcorpus);
        ProcessedTweetSerialization.toFile(id + "-test.ptweets", test);
        /*
        				if (clonar)
        				{
        					File f = new File (id+"-test.ptweets");
        					Path p = f.toPath();
        					CopyOption[] options = new CopyOption[]{
        						      StandardCopyOption.REPLACE_EXISTING,
        						      StandardCopyOption.COPY_ATTRIBUTES
        						     };
        					Files.copy(p, new File (cloneID+"-test.ptweets").toPath(), options);
        				}
        */

      }

      // If the ARFF file does not exist, create it; otherwise keep doing what we have
      // done so far and reuse the previous work.
      if (!new File(id + "-test.arff").exists()) {
        BufferedWriter bw = topics.generateArffHeader(id + "-test.arff");
        int tope = testcorpus.size();
        if (tweetsProcessor == null)
          tweetsProcessor =
              new ProcessTweets(
                  stemming, lematization, urls, hashtags, mentions, unigrams, bigrams);
        for (int indTweet = 0; indTweet < tope; indTweet++) {
          topics.generateArffVector(bw, test.get(indTweet));
        }
        bw.flush();
        bw.close();
      }
      int topeTopics = topics.getTopicsList().size();
      topics.getTopicsList().clear();
      // Generate the classifier.
      // FJRM 25-08-2013: reordered to try to free the topics' memory and leave more
      // available.
      System.out.println("4-Generate classifier " + dateFormat.format(new Date()));
      out.println("4-Generate classifier " + dateFormat.format(new Date()));

      Classifier cls = null;
      DataSource sourceTrain = null;
      Instances dataTrain = null;
      if (new File(id + "-MNB.classifier").exists()) {
        ObjectInputStream ois = new ObjectInputStream(new FileInputStream(id + "-MNB.classifier"));
        cls = (Classifier) ois.readObject();
        ois.close();
      } else {
        sourceTrain = new DataSource(id + "-train.arff");
        dataTrain = sourceTrain.getDataSet();
        if (dataTrain.classIndex() == -1) dataTrain.setClassIndex(dataTrain.numAttributes() - 1);
        // Train the classifier
        cls = new weka.classifiers.bayes.NaiveBayesMultinomial();
        int clase = dataTrain.numAttributes() - 1;
        dataTrain.setClassIndex(clase);
        cls.buildClassifier(dataTrain);
        ObjectOutputStream oos =
            new ObjectOutputStream(new FileOutputStream(id + "-MNB.classifier"));
        oos.writeObject(cls);
        oos.flush();
        oos.close();
        // data.delete(); // not deleted here: kept around for the SVM
      }
      // Now evaluate the classifier on the test data
      System.out.println("6-Evaluate classifier MNB " + dateFormat.format(new Date()));
      out.println("6-Evaluate classifier MNB" + dateFormat.format(new Date()));
      DataSource sourceTest = new DataSource(id + "-test.arff");
      Instances dataTest = sourceTest.getDataSet();
      int clase = dataTest.numAttributes() - 1;
      dataTest.setClassIndex(clase);
      Evaluation eval = new Evaluation(dataTest);
      eval.evaluateModel(cls, dataTest);
      // Now compute the precision, recall and F-measure values, and print the
      // confusion matrices.

      precision = 0;
      recall = 0;
      fmeasure = 0;
      for (int ind = 0; ind < topeTopics; ind++) {
        precision += eval.precision(ind);
        recall += eval.recall(ind);
        fmeasure += eval.fMeasure(ind);
      }
      precision = precision / topeTopics;
      recall = recall / topeTopics;
      fmeasure = fmeasure / topeTopics;
      System.out.println("+++++++++++++++++++++++++++++++++++++++");
      System.out.println(eval.toMatrixString());
      System.out.println("+++++++++++++++++++++++++++++++++++++++");
      System.out.printf("Precision: %.3f\n", precision);
      System.out.printf("Recall: %.3f\n", recall);
      System.out.printf("F-measure: %.3f\n", fmeasure);
      System.out.println("***************************************");
      out.println("+++++++++++++++++++++++++++++++++++++++");
      out.println(eval.toMatrixString());
      out.println("+++++++++++++++++++++++++++++++++++++++");
      out.printf("Precision: %.3f\n", precision);
      out.printf("Recall: %.3f\n", recall);
      out.printf("F-measure: %.3f\n", fmeasure);
      out.println("***************************************");
      /*			DO NOT DELETE
      			System.out.println("7-Evaluate classifier SVM"+dateFormat.format(new Date()));
      			out.println("7-Evaluate classifier SVM"+dateFormat.format(new Date()));
      			if (new File(id+"-SVM.classifier").exists())
      			{
      				ObjectInputStream ois = new ObjectInputStream(new FileInputStream(id+"-SVM.classifier"));
      				cls = (Classifier) ois.readObject();
      				ois.close();
      			}
      			else
      			{
      				if (dataTrain==null)
      				{
      					sourceTrain = new DataSource(id+"-train.arff");
      					dataTrain = sourceTrain.getDataSet();
      					if (dataTrain.classIndex() == -1)
      						dataTrain.setClassIndex(dataTrain.numAttributes() - 1);
      				}
      	//Train the classifier
      				cls = new weka.classifiers.functions.LibSVM();
      				clase = dataTrain.numAttributes()-1;
      				dataTrain.setClassIndex(clase);
      				cls.buildClassifier(dataTrain);
      				ObjectOutputStream oos = new ObjectOutputStream(new FileOutputStream(id+"-SVM.classifier"));
      				oos.writeObject(cls);
      				oos.flush();
      				oos.close();
      				dataTrain.delete();
      			}
      			eval.evaluateModel(cls, dataTest);
      			precision=0;
      			recall=0;
      			fmeasure=0;
      			for(int ind=0; ind<topeTopics; ind++)
      			{
      				precision += eval.precision(ind);
      				recall += eval.recall(ind);
      				fmeasure += eval.fMeasure(ind);
      			}
      			precision = precision / topeTopics;
      			recall = recall / topeTopics;
      			fmeasure = fmeasure / topeTopics;
      			System.out.println("+++++++++++++++++++++++++++++++++++++++");
      			System.out.println(eval.toMatrixString());
      			System.out.println("+++++++++++++++++++++++++++++++++++++++");
      			System.out.printf("Precision: %.3f\n", precision);
      			System.out.printf("Recall: %.3f\n", recall);
      			System.out.printf("F-measure: %.3f\n", fmeasure);
      			System.out.println("***************************************");
      			out.println("+++++++++++++++++++++++++++++++++++++++");
      			out.println(eval.toMatrixString());
      			out.println("+++++++++++++++++++++++++++++++++++++++");
      			out.printf("Precision: %.3f\n", precision);
      			out.printf("Recall: %.3f\n", recall);
      			out.printf("F-measure: %.3f\n", fmeasure);
      			out.println("***************************************");
      */
      System.out.println("Done " + dateFormat.format(new Date()));
      out.println("Done " + dateFormat.format(new Date()));
      if (printer == null) {
        out.flush();
        out.close();
      }
      // Attempt to free memory
      if (dataTrain != null) dataTrain.delete();
      if (dataTest != null) dataTest.delete();
      if (train != null) train.clear();
      if (test != null) test.clear();
      if (topics != null) {
        topics.getTopicsList().clear();
        topics = null;
      }
      if (cls != null) cls = null;
      if (tweetsProcessor != null) tweetsProcessor = null;
      System.gc();
    } catch (Exception e) {
      e.printStackTrace();
    }
  }
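Weka also ships a one-line alternative to the hand-rolled ObjectOutputStream/ObjectInputStream pairs used above; a sketch (the path reuses this example's naming scheme):

  import weka.classifiers.Classifier;
  import weka.core.SerializationHelper;

  // Sketch: save and restore a trained model in one call each.
  SerializationHelper.write(id + "-MNB.classifier", cls);
  Classifier restored = (Classifier) SerializationHelper.read(id + "-MNB.classifier");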
Example #10
 /**
  * Removes a specified number of instances from the given set of instances.
  *
  * @param data given instances
  * @param numRemove number of instances to delete from the given instances
  */
 protected void removeInstances(Instances data, int numRemove) {
   int num = data.numInstances();
   for (int i = num - 1; i > num - 1 - numRemove; i--) {
     data.delete(i);
   }
 }
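A non-destructive alternative is to copy the kept prefix instead of deleting from the tail; a sketch using the Instances(Instances, int, int) copy constructor:

  // Sketch: keep the first (numInstances - numRemove) instances without mutating 'data'.
  Instances trimmed = new Instances(data, 0, data.numInstances() - numRemove);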
Example #11
  /**
   * Calculates the class membership probabilities for the given test instance.
   *
   * @param instance the instance to be classified
   * @return predicted class probability distribution
   * @throws Exception if an error occurred during the prediction
   */
  public double[] distributionForInstance(Instance instance) throws Exception {

    NaiveBayes nb = new NaiveBayes();

    // System.out.println("number of instances		"+m_Train.numInstances());

    if (m_Train.numInstances() == 0) {
      // throw new Exception("No training instances!");
      return m_defaultModel.distributionForInstance(instance);
    }
    if ((m_WindowSize > 0) && (m_Train.numInstances() > m_WindowSize)) {
      m_kNNValid = false;
      boolean deletedInstance = false;
      while (m_Train.numInstances() > m_WindowSize) {
        m_Train.delete(0);
        deletedInstance = true; // bug fix: this flag was never set, so the rebuild below never ran
      }
      // rebuild the search structure; KDTree currently can't delete
      if (deletedInstance) m_NNSearch.setInstances(m_Train);
    }

    // Select k by cross validation
    if (!m_kNNValid && (m_CrossValidate) && (m_kNNUpper >= 1)) {
      crossValidate();
    }

    m_NNSearch.addInstanceInfo(instance);
    m_kNN = 1000;
    Instances neighbours = m_NNSearch.kNearestNeighbours(instance, m_kNN);
    double[] distances = m_NNSearch.getDistances();

    // System.out.println("--------------classify instance--------- ");
    // System.out.println("neighbours.numInstances"+neighbours.numInstances());
    // System.out.println("distances.length"+distances.length);
    // System.out.println("--------------classify instance--------- ");

    /*	for (int k = 0; k < distances.length; k++) {
    	//System.out.println("-------");
    	//System.out.println("distance of "+k+"	"+distances[k]);
    	//System.out.println("instance of "+k+"	"+neighbours.instance(k));
    	//distances[k] = distances[k]+0.1;
    	//System.out.println("------- after add 0.1");
    	//System.out.println("distance of "+k+"	"+distances[k]);
    }
    */
    Instances instances = new Instances(m_Train);
    // int attrnum = instances.numAttributes();
    instances.deleteWithMissingClass();

    Instances newm_Train = new Instances(instances, 0, instances.numInstances());

    for (int k = 0; k < neighbours.numInstances(); k++) {
      // System.out.println("-------");
      // Instance in = new Instance();
      Instance insk = neighbours.instance(k);
      // System.out.println("instance "+k+"	"+neighbours.instance(k));
      // System.out.println("-------");
      double dis = distances[k] + 0.1;
      // System.out.println("dis		"+dis);
      dis = (1 / dis) * 10;
      // System.out.println("1/dis		"+dis);
      int weightnum = (int) dis;
      // System.out.println("weightnum		"+weightnum);

      for (int s = 0; s < weightnum; s++) {

        newm_Train.add(insk);
      }
    }

    // System.out.println("number of instances		"+newm_Train.numInstances());

    /*  for (int k = 0; k < newm_Train.numInstances(); k++) {
    		System.out.println("-------");
    		System.out.println("instance "+k+"	"+newm_Train.instance(k));
    		System.out.println("-------");
    	}
    */
    /*
    	for (int k = 0; k < distances.length; k++) {
    		System.out.println("-------");
    		System.out.println("distance of "+k+"	"+distances[k]);
    		System.out.println("-------");
    	}*/

    nb.buildClassifier(newm_Train);
    double[] dis = nb.distributionForInstance(instance);
    // double[] distribution = makeDistribution(neighbours, distances);
    return dis;
    // return distribution;
  }
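Duplicating each neighbour weightnum times approximates weighting by inflating the training set. Since weka.classifiers.bayes.NaiveBayes implements WeightedInstancesHandler, setting instance weights directly is a leaner sketch of the same idea:

  // Sketch: weight each neighbour by inverse distance instead of duplicating it.
  Instances weighted = new Instances(neighbours, 0);
  for (int k = 0; k < neighbours.numInstances(); k++) {
    Instance copy = (Instance) neighbours.instance(k).copy();
    copy.setWeight(10.0 / (distances[k] + 0.1)); // same 1/(d + 0.1) * 10 scheme as above
    weighted.add(copy);
  }
  nb.buildClassifier(weighted);
  return nb.distributionForInstance(instance);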