/**
 * Records the rank and the NNP flag of an N-Gram in the validation lists.
 *
 * <p>The label is split on single spaces: a label yielding more than one term goes to the
 * N-Gram validation lists, any other label goes to the 1-Gram validation lists.
 *
 * @param nGramsFreqStats statistics of the N-Gram to record
 */
public void add(CNGramsStats nGramsFreqStats) {
    final String[] labelTerms = nGramsFreqStats.getLabel().split(" ");
    final boolean hasSeveralTerms = labelTerms.length > 1;

    if (!hasSeveralTerms) {
        // Single-term label: record it as a 1-Gram.
        _validation1GramRankList.add(Integer.valueOf(nGramsFreqStats.getNGramRank()));
        _validation1GramNNPList.add(Boolean.valueOf(nGramsFreqStats.isNNP()));
        return;
    }

    // Multi-term label: record it as an N-Gram.
    _validationNGramRankList.add(Integer.valueOf(nGramsFreqStats.getNGramRank()));
    _validationNGramNNPList.add(Boolean.valueOf(nGramsFreqStats.isNNP()));
}
/**
 * Records the feature vector of an N-Gram in the training datasets.
 *
 * <p>A four-feature vector (N-Gram rank, tag estimator, max term frequency estimator, NNP
 * flag as 1.0/0.0) is appended to the N-Gram dataset list when the label, split on single
 * spaces, yields more than one term, and to the 1-Gram dataset list otherwise. A
 * five-feature vector, made of the same four features followed by a compound flag
 * (1.0 for a multi-term label, 0.0 otherwise), is always appended to the combined list.
 *
 * @param nGramsFreqStats statistics of the N-Gram to record
 */
public void add(CNGramsStats nGramsFreqStats) {
    // Evaluate the compound test once; it drives both the flag and the target list.
    final boolean isCompound = nGramsFreqStats.getLabel().split(" ").length > 1;

    final CNGramsModel model = CNGramsModel.getInstance();
    final float tagEstimator = model.getTagEstimator();
    final float maxTermFreqEstimator = model.getMaxTermFreqEstimator();

    // Features shared by the per-kind dataset and the combined dataset.
    final double[] features = {
        nGramsFreqStats.getNGramRank(),
        tagEstimator,
        maxTermFreqEstimator,
        nGramsFreqStats.isNNP() ? 1.0 : 0.0
    };

    // The combined dataset carries the shared features plus the compound flag.
    final double[] allDataset = new double[features.length + 1];
    System.arraycopy(features, 0, allDataset, 0, features.length);
    allDataset[features.length] = isCompound ? 1.0 : 0.0;

    if (isCompound) {
        _nGramDatasetsList.add(features);
    } else {
        _oneGramDatasetsList.add(features);
    }
    _allDatasetsList.add(allDataset);
}
/**
 * Computes the key parameters of the NGramsOrder model and writes them to a training file.
 *
 * <p>Every entry of the frequency statistics list whose rank is below
 * {@code MAX_ORDER_NGRAM_STATS} is accumulated into a per-rank statistic, kept separately
 * for compound N-Grams and for 1-Grams. The per-rank statistics are then finalized with the
 * number of collected entries of their kind and rendered into a text report, which is
 * written to the file named {@code TRAINING_FILES + "_" + _fileIndex}. An
 * {@link IOException} raised by the write is logged and not propagated.
 */
private void compute() {
    // One accumulator per rank, for compound N-Grams and for 1-Grams.
    final NLabeledNGramsStats[] compoundStats = new NLabeledNGramsStats[MAX_ORDER_NGRAM_STATS];
    final NLabeled1GramsStats[] singleStats = new NLabeled1GramsStats[MAX_ORDER_NGRAM_STATS];
    for (int rank = 0; rank < MAX_ORDER_NGRAM_STATS; rank++) {
        compoundStats[rank] = new NLabeledNGramsStats(rank);
        singleStats[rank] = new NLabeled1GramsStats(rank);
    }

    int compoundCount = 0;
    int singleCount = 0;
    for (CNGramsStats stat : _nGramsFrequencyStatsList) {
        final int rank = stat.getNGramRank();

        // Collect only lower order NGram labels.
        if (rank >= MAX_ORDER_NGRAM_STATS) {
            continue;
        }

        if (stat.isCompound()) {
            compoundStats[rank].add(
                    stat.getNumNGramOccurrences(),
                    stat.isNNP(),
                    stat.getMaxNumTermOccurrences());
            compoundCount++;
        } else {
            // 1-Grams pass -1 in place of a max term occurrences count.
            singleStats[rank].add(stat.getNumNGramOccurrences(), stat.isNNP(), -1);
            singleCount++;
        }
    }

    final CNGramsModel model = CNGramsModel.getInstance();

    // Header line: the two estimators currently held by the model.
    final StringBuilder report = new StringBuilder();
    report.append(CFileUtil.COMMENTS_FIRST_CHAR)
            .append(" nnpFudge:")
            .append(model.getTagEstimator())
            .append(" compoundFudge:")
            .append(model.getMaxTermFreqEstimator())
            .append("\n");

    // 1-Gram section: only the ranks whose compute() reports true are emitted.
    // NOTE(review): compute(count) presumably normalizes into averages - confirm.
    report.append(CFileUtil.COMMENTS_FIRST_CHAR).append("1-Gram frequency model parameters\n");
    for (NLabeled1GramsStats singleStat : singleStats) {
        if (singleStat.compute(singleCount)) {
            report.append(singleStat.toString()).append("\n");
        }
    }
    report.append(CEnv.ENTRIES_DELIM).append("\n");

    // N-Gram section, same layout as the 1-Gram section.
    report.append(CFileUtil.COMMENTS_FIRST_CHAR).append("N-Gram frequency model parameters\n");
    for (NLabeledNGramsStats compoundStat : compoundStats) {
        if (compoundStat.compute(compoundCount)) {
            report.append(compoundStat.toString()).append("\n");
        }
    }
    report.append(CEnv.ENTRIES_DELIM);

    try {
        CFileUtil.write(TRAINING_FILES + "_" + _fileIndex, report.toString());
    } catch (IOException e) {
        CLogger.error(e.toString());
    }
}