/**
 * Initialize this separate model using a complete list.
 *
 * @param documents      the full set of instances, ordered so that all
 *                       training documents precede all test documents
 * @param testStartIndex the index of the first test document; all documents
 *                       before this index become the training set
 */
public void divideDocuments(InstanceList documents, int testStartIndex) {
  Alphabet dataAlpha = documents.getDataAlphabet();
  Alphabet targetAlpha = documents.getTargetAlphabet();
  this.training = new InstanceList(dataAlpha, targetAlpha);
  this.test = new InstanceList(dataAlpha, targetAlpha);
  for (int di = 0; di < testStartIndex; di++) {
    training.add(documents.get(di));
  }
  for (int di = testStartIndex; di < documents.size(); di++) {
    test.add(documents.get(di));
  }
}
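// A minimal usage sketch for divideDocuments. The 90/10 split point is an
// illustrative assumption, not part of the API; the method itself accepts
// any boundary index.
public void exampleDivide(InstanceList documents) {
  // Reserve the last 10% of the corpus for testing.
  int testStartIndex = (int) (documents.size() * 0.9);
  divideDocuments(documents, testStartIndex);
  // this.training now holds documents [0, testStartIndex);
  // this.test holds documents [testStartIndex, documents.size()).
}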
public void addDocuments(InstanceList additionalDocuments,
                         int numIterations,
                         int showTopicsInterval,
                         int outputModelInterval,
                         String outputModelFilename,
                         Randoms r) {
  if (ilist == null)
    throw new IllegalStateException("Must already have some documents first.");
  for (Instance inst : additionalDocuments)
    ilist.add(inst);
  assert (ilist.getDataAlphabet() == additionalDocuments.getDataAlphabet());
  assert (additionalDocuments.getDataAlphabet().size() >= numTypes);
  numTypes = additionalDocuments.getDataAlphabet().size();

  int numNewDocs = additionalDocuments.size();
  int numOldDocs = topics.length;
  int numDocs = numOldDocs + numNewDocs;

  // Expand various arrays to make space for the new data.
  int[][] newTopics = new int[numDocs][];
  for (int i = 0; i < topics.length; i++)
    newTopics[i] = topics[i];
  topics = newTopics; // The rest of this array will be initialized below.

  int[][] newDocTopicCounts = new int[numDocs][numTopics];
  for (int i = 0; i < docTopicCounts.length; i++)
    newDocTopicCounts[i] = docTopicCounts[i];
  docTopicCounts = newDocTopicCounts; // The rest of this array will be initialized below.

  int[][] newTypeTopicCounts = new int[numTypes][numTopics];
  for (int i = 0; i < typeTopicCounts.length; i++)
    for (int j = 0; j < numTopics; j++)
      newTypeTopicCounts[i][j] = typeTopicCounts[i][j];
  typeTopicCounts = newTypeTopicCounts; // This array is further populated below.

  FeatureSequence fs;
  for (int di = numOldDocs; di < numDocs; di++) {
    try {
      fs = (FeatureSequence) additionalDocuments.get(di - numOldDocs).getData();
    } catch (ClassCastException e) {
      System.err.println(
          "LDA and other topic models expect FeatureSequence data, not FeatureVector data. "
              + "With text2vectors, you can obtain such data with --keep-sequence or --keep-bisequence.");
      throw e;
    }
    int seqLen = fs.getLength();
    numTokens += seqLen;
    topics[di] = new int[seqLen];
    // Randomly assign tokens to topics.
    for (int si = 0; si < seqLen; si++) {
      int topic = r.nextInt(numTopics);
      topics[di][si] = topic;
      docTopicCounts[di][topic]++;
      typeTopicCounts[fs.getIndexAtPosition(si)][topic]++;
      tokensPerTopic[topic]++;
    }
  }
}
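// A minimal usage sketch for addDocuments: fold a second corpus into a model
// that has already been estimated on an initial InstanceList. The iteration
// counts and interval values are illustrative assumptions; `newDocuments` is
// assumed to share the data alphabet of the original corpus.
public void exampleAddDocuments(InstanceList newDocuments) {
  addDocuments(newDocuments,
               200,            // numIterations (illustrative)
               50,             // showTopicsInterval (illustrative)
               0,              // outputModelInterval (illustrative)
               null,           // outputModelFilename: none written here
               new Randoms()); // source of randomness for topic initialization
}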
public static CRF4 createCRF(File trainingFile, CRFInfo crfInfo)
    throws FileNotFoundException {
  Reader trainingFileReader = new FileReader(trainingFile);

  // Create a pipe that we can use to convert the training
  // file to a feature vector sequence.
  Pipe p = new SimpleTagger.SimpleTaggerSentence2FeatureVectorSequence();

  // The training file does contain tags (aka targets).
  p.setTargetProcessing(true);

  // Register the default tag with the pipe by looking it up
  // in the target alphabet before we look up any other tag.
  p.getTargetAlphabet().lookupIndex(crfInfo.defaultLabel);

  // Create a new instance list to hold the training data.
  InstanceList trainingData = new InstanceList(p);

  // Read in the training data.
  trainingData.add(new LineGroupIterator(
      trainingFileReader, Pattern.compile("^\\s*$"), true));

  // Create the CRF model.
  CRF4 crf = new CRF4(p, null);

  // Set various configuration options.
  crf.setGaussianPriorVariance(crfInfo.gaussianVariance);
  crf.setTransductionType(crfInfo.transductionType);

  // Set up the model's states.
  if (crfInfo.stateInfoList != null) {
    Iterator stateIter = crfInfo.stateInfoList.iterator();
    while (stateIter.hasNext()) {
      CRFInfo.StateInfo state = (CRFInfo.StateInfo) stateIter.next();
      crf.addState(state.name, state.initialCost, state.finalCost,
                   state.destinationNames, state.labelNames,
                   state.weightNames);
    }
  } else if (crfInfo.stateStructure == CRFInfo.FULLY_CONNECTED_STRUCTURE)
    crf.addStatesForLabelsConnectedAsIn(trainingData);
  else if (crfInfo.stateStructure == CRFInfo.HALF_CONNECTED_STRUCTURE)
    crf.addStatesForHalfLabelsConnectedAsIn(trainingData);
  else if (crfInfo.stateStructure == CRFInfo.THREE_QUARTERS_CONNECTED_STRUCTURE)
    crf.addStatesForThreeQuarterLabelsConnectedAsIn(trainingData);
  else if (crfInfo.stateStructure == CRFInfo.BILABELS_STRUCTURE)
    crf.addStatesForBiLabelsConnectedAsIn(trainingData);
  else
    throw new RuntimeException(
        "Unexpected state structure " + crfInfo.stateStructure);

  // Set up the weight groups.
  if (crfInfo.weightGroupInfoList != null) {
    Iterator wgIter = crfInfo.weightGroupInfoList.iterator();
    while (wgIter.hasNext()) {
      CRFInfo.WeightGroupInfo wg = (CRFInfo.WeightGroupInfo) wgIter.next();
      FeatureSelection fs = FeatureSelection.createFromRegex(
          crf.getInputAlphabet(),
          Pattern.compile(wg.featureSelectionRegex));
      crf.setFeatureSelection(crf.getWeightsIndex(wg.name), fs);
    }
  }

  // Train the CRF.
  crf.train(trainingData, null, null, null, crfInfo.maxIterations);

  return crf;
}
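// A minimal usage sketch for createCRF. The file names are illustrative, and
// `loadCrfInfo` is a hypothetical helper standing in for however this project
// actually builds a CRFInfo (e.g., from a configuration file).
public static CRF4 exampleCreateCRF() throws IOException {
  CRFInfo crfInfo = loadCrfInfo(new File("crf.config")); // hypothetical helper
  return createCRF(new File("train.txt"), crfInfo);
}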