Esempio n. 1
0
  /**
   * Trains a classifier on a 90/10 split of the concordance data, evaluates it on the held-out
   * part, prints one result per label and saves the trained classifier.
   *
   * @param targetTerm term passed to the concordance reader and recorded in every result
   * @param sourceFile concordance file to read; also recorded in every result
   * @param trainingAlgo algorithm name used to look up the trainer; also recorded in every result
   *     and passed on when the classifier is saved
   * @param outputFileClassifier destination handed to {@code saveClassifierToFile}
   * @param outputFileResults not used by this method
   * @param termWindowSize window size passed to the concordance reader, the results and the save
   *     call
   * @param pipe Mallet pipe passed to the concordance reader
   * @param useCollocationalVector passed to the concordance reader; results are tagged
   *     {@code "coll"} when {@code true} and {@code "bow"} otherwise
   * @return one result per entry of the trained classifier's label alphabet, in alphabet order;
   *     empty when the classifier exposes no label alphabet
   */
  private static List<ClassificationResult> runTrainingAndClassification(
      String targetTerm,
      String sourceFile,
      String trainingAlgo,
      String outputFileClassifier,
      String outputFileResults,
      int termWindowSize,
      Pipe pipe,
      boolean useCollocationalVector) {
    // Read in concordance file and create list of Mallet training instances
    // TODO: Remove duplication of code (see execConvertToMalletFormat(...))
    String vectorType = useCollocationalVector ? "coll" : "bow";

    InstanceList instanceList =
        readConcordanceFileToInstanceList(
            targetTerm, sourceFile, termWindowSize, pipe, useCollocationalVector);

    // Creating splits for training (90%) and testing (10%)
    double[] proportions = {0.9, 0.1};
    InstanceList[] splitLists = instanceList.split(proportions);
    InstanceList trainingList = splitLists[0];
    InstanceList testList = splitLists[1];

    // Train the classifier
    ClassifierTrainer classifierTrainer = getClassifierTrainerForAlgorithm(trainingAlgo);
    Classifier classifier = classifierTrainer.train(trainingList);

    // TODO: Make sure this is not null in RandomClassifier
    boolean hasLabelAlphabet = classifier.getLabelAlphabet() != null;
    if (hasLabelAlphabet) {
      System.out.println("Labels:\n" + classifier.getLabelAlphabet());
      System.out.println(
          "Size of data alphabet (= type count of training list): "
              + classifier.getAlphabet().size());
    }

    // Run tests and get results
    Trial trial = new Trial(classifier, testList);
    List<ClassificationResult> results = new ArrayList<ClassificationResult>();

    // Without a label alphabet there is nothing to report per label; skipping the loop avoids the
    // NullPointerException the unguarded dereference used to cause.
    int labelCount = hasLabelAlphabet ? classifier.getLabelAlphabet().size() : 0;
    for (int i = 0; i < labelCount; i++) {
      Label label = classifier.getLabelAlphabet().lookupLabel(i);
      ClassificationResult result =
          new MalletClassificationResult(
              trainingAlgo,
              targetTerm,
              vectorType,
              label.toString(),
              termWindowSize,
              trial,
              sourceFile);
      results.add(result);

      System.out.println(result.toString());
    }

    // Save classifier
    saveClassifierToFile(outputFileClassifier, classifier, trainingAlgo, termWindowSize);

    return results;
  }
  /**
   * Trains a classifier on the given instance list and hands it to {@code saveClassifier}.
   *
   * <p>The target file is {@code "Models/"} followed by the trained classifier's
   * {@code toString()} value.
   *
   * @param ilist instances to train on
   * @param trainer trainer used to build the classifier
   * @throws IOException declared for failures while saving the classifier
   */
  public static void trainAndSaveOnWholeData(InstanceList ilist, ClassifierTrainer trainer)
      throws IOException {
    Classifier trainedModel = trainer.train(ilist);
    File modelFile = new File("Models/" + trainedModel.toString());
    saveClassifier(trainedModel, modelFile);
  }
  /**
   * Trains a MaxEnt classifier from a training feature table and serializes it to
   * {@code modelFile}.
   *
   * <p>Lines in the training feature table should be formatted as:
   *
   * <pre>
   *   [name] [label] [data ... ]
   * </pre>
   *
   * <p>The features in {@code [data ...]} should look like {@code feature:value}.
   *
   * <p>While the trainer runs, {@code System.err} is redirected to {@code System.out}; the
   * previous stream is restored afterwards, including when training fails. The elapsed training
   * time in milliseconds is printed to {@code System.out}.
   *
   * @param trainingFilename path of the training feature table
   * @param modelFile file the trained classifier is written to using Java serialization
   * @return the trained classifier
   * @throws IOException if the training file cannot be read or the model cannot be written
   */
  public static Classifier TrainMaxent(String trainingFilename, File modelFile) throws IOException {
    // build data input pipe
    ArrayList<Pipe> pipes = new ArrayList<Pipe>();
    pipes.add(new Target2Label());
    pipes.add(new Csv2FeatureVector());

    Pipe pipe = new SerialPipes(pipes);
    pipe.setTargetProcessing(true);

    // read data; the reader is closed even if parsing or piping throws
    InstanceList trainingInstances = new InstanceList(pipe);
    try (FileReader trainingFileReader = new FileReader(trainingFilename)) {
      CsvIterator reader =
          new CsvIterator(
              trainingFileReader,
              "(\\w+)\\s+([^\\s]+)\\s+(.*)",
              3,
              2,
              1); // (data, label, name) field indices
      trainingInstances.addThruPipe(reader);
    }

    // calculate running time
    long startTime = System.currentTimeMillis();
    PrintStream originalErr = System.err;
    System.setErr(System.out);

    Classifier classifier;
    try {
      // train a Maxent classifier (could be other classifiers)
      ClassifierTrainer trainer = new MaxEntTrainer(Gaussian_Variance);
      classifier = trainer.train(trainingInstances);
    } finally {
      // never leave the process-wide stderr redirected, even when training fails
      System.setErr(originalErr);
    }

    long totalTime = System.currentTimeMillis() - startTime;
    System.out.println("Total training time: " + totalTime);

    // write model; both streams are closed even if serialization throws
    try (FileOutputStream fileOut = new FileOutputStream(modelFile);
        ObjectOutputStream oos = new ObjectOutputStream(fileOut)) {
      oos.writeObject(classifier);
    }

    return classifier;
  }
 /**
  * Trains on the given instances, stores the result in {@code classifier} and prints performance
  * for the training set and, when supplied, for the testing set.
  *
  * @param trainingInstances instances handed to the trainer
  * @param testingInstances optional held-out instances; may be {@code null}, in which case only
  *     training performance is printed
  * @return the value of {@code classifier} after training
  */
 public Classifier train(InstanceList trainingInstances, InstanceList testingInstances) {
   classifier = trainer.train(trainingInstances);
   printPerformance(trainingInstances, "Training\n=======");

   // No held-out data: nothing further to report.
   if (testingInstances == null) {
     return classifier;
   }

   printPerformance(testingInstances, "Testing\n=======");
   return classifier;
 }
  /**
   * Builds the training instance list for {@code dir}, trains a classifier with the trainer
   * factory named by {@code args[0]} and serializes the classifier into {@code dir}.
   *
   * <p>The instance list is also saved as {@code training-data.ser} in {@code dir}. The factory is
   * looked up first under the name as given and, if that lookup yields {@code null}, under
   * {@code org.cleartk.ml.mallet.factory.<name>TrainerFactory}.
   *
   * @param dir directory holding the training data; receives {@code training-data.ser} and the
   *     serialized model
   * @param args {@code args[0]} is the trainer factory name; any remaining elements are passed to
   *     the factory's {@code createTrainer}
   * @throws IllegalArgumentException if no factory name is given, the name cannot be resolved, or
   *     the factory fails to create a trainer
   * @throws Exception for failures while building or saving the instance list, instantiating the
   *     factory, training, or writing the model
   */
  public void trainClassifier(File dir, String... args) throws Exception {
    // Fail fast with a clear message instead of an ArrayIndexOutOfBoundsException on args[0].
    if (args == null || args.length == 0) {
      throw new IllegalArgumentException(
          "the name of a classifier trainer factory must be given as the first argument");
    }

    InstanceListCreator instanceListCreator = new InstanceListCreator();
    InstanceList instanceList = instanceListCreator.createInstanceList(getTrainingDataFile(dir));
    instanceList.save(new File(dir, "training-data.ser"));

    String factoryName = args[0];
    Class<ClassifierTrainerFactory<?>> factoryClass = createTrainerFactory(factoryName);
    if (factoryClass == null) {
      // Retry with the short name expanded to the default factory package.
      String factoryName2 = "org.cleartk.ml.mallet.factory." + factoryName + "TrainerFactory";
      factoryClass = createTrainerFactory(factoryName2);
    }
    if (factoryClass == null) {
      throw new IllegalArgumentException(
          String.format(
              "name for classifier trainer factory is not valid: name given ='%s'.  Valid classifier names include: %s, %s, %s, and %s",
              factoryName,
              ClassifierTrainerFactory.NAMES[0],
              ClassifierTrainerFactory.NAMES[1],
              ClassifierTrainerFactory.NAMES[2],
              ClassifierTrainerFactory.NAMES[3]));
    }

    // Everything after the factory name belongs to the factory.
    String[] factoryArgs = new String[args.length - 1];
    System.arraycopy(args, 1, factoryArgs, 0, factoryArgs.length);

    ClassifierTrainerFactory<?> factory = factoryClass.newInstance();
    ClassifierTrainer<?> trainer = null;
    try {
      trainer = factory.createTrainer(factoryArgs);
    } catch (Throwable t) {
      throw new IllegalArgumentException(
          "Unable to create trainer.  Usage for "
              + factoryClass.getCanonicalName()
              + ": "
              + factory.getUsageMessage(),
          t);
    }

    this.classifier = trainer.train(instanceList);

    // Both streams are closed even if serialization throws.
    try (FileOutputStream fileOut = new FileOutputStream(new File(dir, MODEL_NAME));
        ObjectOutputStream oos = new ObjectOutputStream(fileOut)) {
      oos.writeObject(classifier);
    }
  }
 /**
  * Trains on the given instances, stores the result in {@code classifier}, then calls
  * {@code writeExtremeFeatures()}.
  *
  * <p>{@code trainer}, {@code classifier} and {@code writeExtremeFeatures()} are declared outside
  * this method.
  *
  * @param trainingInstances instances handed to the trainer
  * @return the value of {@code classifier} after {@code writeExtremeFeatures()} has run
  */
 public Classifier train(InstanceList trainingInstances) {
   classifier = trainer.train(trainingInstances);
   // Runs after the new classifier has been stored and before it is returned.
   writeExtremeFeatures();
   return classifier;
 }
  /**
   * Runs cross validation and returns the test-fold metrics averaged over all folds.
   *
   * <p>For each fold a new classifier is trained on the training part; accuracy, precision,
   * recall and F1 on the test part are printed and accumulated. Precision, recall and F1 are
   * requested for label index 1. Per-fold counts of correct/incorrect predictions and of each
   * predicted/target combination (0 or 1) are printed as well.
   *
   * @param ilist instances to split into cross-validation folds
   * @param i number of folds, passed to {@code crossValidationIterator}
   * @param trainer trainer used to build a classifier for every fold
   * @return a list of four values in this order: average accuracy, average precision, average
   *     recall, average F1; every value is {@code NaN} if the iterator produced no folds
   */
  public static ArrayList<Double> getAverageCrossValidationScore(
      InstanceList ilist, int i, ClassifierTrainer trainer) {

    final int positiveLabelIndex = 1;

    double crossValidAccSum = 0;
    double crossValidPrcSum = 0;
    double crossValidRecSum = 0;
    double crossValidF1Sum = 0;

    int count = 0;

    // get cross validation folds
    CrossValidationIterator cvIlists = ilist.crossValidationIterator(i);

    while (cvIlists.hasNext()) {

      System.out.println("#############Performing " + count + " iteration###########");

      InstanceList[] ilists = cvIlists.next();
      InstanceList trainFold = ilists[0];
      InstanceList testFold = ilists[1];

      System.out.println("The train set size is " + trainFold.size());
      System.out.println("The test set size is " + testFold.size());
      Classifier classifier = trainer.train(trainFold);
      System.out.println("The training accuracy is " + classifier.getAccuracy(trainFold));

      // Evaluate each test metric once and reuse the value for both the printout and the sum,
      // instead of querying the classifier twice per metric.
      double testAccuracy = classifier.getAccuracy(testFold);
      System.out.println("The testing accuracy is " + testAccuracy);
      double testPrecision = classifier.getPrecision(testFold, positiveLabelIndex);
      System.out.println("The testing precision is " + testPrecision);
      double testRecall = classifier.getRecall(testFold, positiveLabelIndex);
      System.out.println("The testing recall is " + testRecall);
      double testF1 = classifier.getF1(testFold, positiveLabelIndex);
      System.out.println("The testing f1score is " + testF1);

      crossValidAccSum += testAccuracy;
      crossValidPrcSum += testPrecision;
      crossValidRecSum += testRecall;
      crossValidF1Sum += testF1;
      count++;

      // additional calculations: counts per predicted/target combination
      ArrayList<Classification> outClassifications = classifier.classify(testFold);
      int p1l1 = 0;
      int p1l0 = 0;
      int p0l1 = 0;
      int p0l0 = 0;
      int countCorrect = 0;
      int countIncorrect = 0;

      System.out.println("Outclassification size " + outClassifications.size());
      for (Classification classification : outClassifications) {
        // NOTE(review): the predicted value is an index into the label alphabet while the target
        // is the label's text parsed as a number; the counts below are only meaningful if label
        // "1" sits at index 1 and label "0" at index 0 - confirm against the data pipe.
        int predictedLabel = classification.getLabeling().getBestIndex();
        double targetLabel =
            Double.parseDouble(classification.getInstance().getTarget().toString());

        if (classification.bestLabelIsCorrect()) {
          countCorrect++;
        } else {
          countIncorrect++;
        }

        // Combinations outside {0, 1} x {0, 1} are deliberately not counted.
        if (predictedLabel == 1 && targetLabel == 1.0) {
          p1l1++;
        } else if (predictedLabel == 1 && targetLabel == 0.0) {
          p1l0++;
        } else if (predictedLabel == 0 && targetLabel == 1.0) {
          p0l1++;
        } else if (predictedLabel == 0 && targetLabel == 0.0) {
          p0l0++;
        }
      }

      System.out.println("Count Correct " + countCorrect);
      System.out.println("Count Incorrect " + countIncorrect);
      System.out.println("p1l1 " + p1l1);
      System.out.println("p1l0 " + p1l0);
      System.out.println("p0l1 " + p0l1);
      System.out.println("p0l0 " + p0l0);
    }

    // Averages over the number of folds actually processed.
    ArrayList<Double> results = new ArrayList<Double>();
    results.add(crossValidAccSum / count);
    results.add(crossValidPrcSum / count);
    results.add(crossValidRecSum / count);
    results.add(crossValidF1Sum / count);

    return results;
  }