/**
   * Initialize this separate model using a complete list.
   *
   * @param documents
   * @param testStartIndex
   */
  public void divideDocuments(InstanceList documents, int testStartIndex) {
    Alphabet dataAlpha = documents.getDataAlphabet();
    Alphabet targetAlpha = documents.getTargetAlphabet();

    this.training = new InstanceList(dataAlpha, targetAlpha);
    this.test = new InstanceList(dataAlpha, targetAlpha);
    int di = 0;
    for (di = 0; di < testStartIndex; di++) {
      training.add(documents.get(di));
    }
    for (di = testStartIndex; di < documents.size(); di++) {
      test.add(documents.get(di));
    }
  }
Example #2
0
  public static CRF4 createCRF(File trainingFile, CRFInfo crfInfo) throws FileNotFoundException {
    Reader trainingFileReader = new FileReader(trainingFile);

    // Create a pipe that we can use to convert the training
    // file to a feature vector sequence.
    Pipe p = new SimpleTagger.SimpleTaggerSentence2FeatureVectorSequence();

    // The training file does contain tags (aka targets)
    p.setTargetProcessing(true);

    // Register the default tag with the pipe, by looking it up
    // in the targetAlphabet before we look up any other tag.
    p.getTargetAlphabet().lookupIndex(crfInfo.defaultLabel);

    // Create a new instancelist to hold the training data.
    InstanceList trainingData = new InstanceList(p);

    // Read in the training data.
    trainingData.add(new LineGroupIterator(trainingFileReader, Pattern.compile("^\\s*$"), true));

    // Create the CRF model.
    CRF4 crf = new CRF4(p, null);

    // Set various config options
    crf.setGaussianPriorVariance(crfInfo.gaussianVariance);
    crf.setTransductionType(crfInfo.transductionType);

    // Set up the model's states.
    if (crfInfo.stateInfoList != null) {
      Iterator stateIter = crfInfo.stateInfoList.iterator();
      while (stateIter.hasNext()) {
        CRFInfo.StateInfo state = (CRFInfo.StateInfo) stateIter.next();
        crf.addState(
            state.name,
            state.initialCost,
            state.finalCost,
            state.destinationNames,
            state.labelNames,
            state.weightNames);
      }
    } else if (crfInfo.stateStructure == CRFInfo.FULLY_CONNECTED_STRUCTURE)
      crf.addStatesForLabelsConnectedAsIn(trainingData);
    else if (crfInfo.stateStructure == CRFInfo.HALF_CONNECTED_STRUCTURE)
      crf.addStatesForHalfLabelsConnectedAsIn(trainingData);
    else if (crfInfo.stateStructure == CRFInfo.THREE_QUARTERS_CONNECTED_STRUCTURE)
      crf.addStatesForThreeQuarterLabelsConnectedAsIn(trainingData);
    else if (crfInfo.stateStructure == CRFInfo.BILABELS_STRUCTURE)
      crf.addStatesForBiLabelsConnectedAsIn(trainingData);
    else throw new RuntimeException("Unexpected state structure " + crfInfo.stateStructure);

    // Set up the weight groups.
    if (crfInfo.weightGroupInfoList != null) {
      Iterator wgIter = crfInfo.weightGroupInfoList.iterator();
      while (wgIter.hasNext()) {
        CRFInfo.WeightGroupInfo wg = (CRFInfo.WeightGroupInfo) wgIter.next();
        FeatureSelection fs =
            FeatureSelection.createFromRegex(
                crf.getInputAlphabet(), Pattern.compile(wg.featureSelectionRegex));
        crf.setFeatureSelection(crf.getWeightsIndex(wg.name), fs);
      }
    }

    // Train the CRF.
    crf.train(trainingData, null, null, null, crfInfo.maxIterations);

    return crf;
  }