/**
 * Runs the N-Grams extraction over the list of database records and feeds
 * each valid observation into the supplied regression or clustering sets.
 *
 * @param entriesList records extracted from the reference database
 * @param sets regression or clustering sets collecting the observations
 * @return number of observations added to the sets
 * @throws SQLException if the reference database is unavailable
 */
private int executeRun(List<String[]> entriesList, ISets sets) throws SQLException {
    int counter = 0;

    for (String[] fields : entriesList) {
        if (sets.isValid(fields)) {
            try {
                CNGramsStats nGramsFrequencyStat = extractFragment(fields[1], fields[0]);

                /*
                 * Adds this new observation to the regression or classifier.
                 */
                if (nGramsFrequencyStat != null) {
                    sets.add(nGramsFrequencyStat);
                    counter++;
                }
            } catch (SemanticAnalysisException e) {
                CLogger.error(e.toString());
            }
        }
    }
    sets.compute();

    return counter;
}
/**
 * Runs a regression of the N-Gram frequency model against the Wikipedia
 * reference database, sweeping the tag and maximum term frequency
 * estimators over the supplied test values.
 *
 * @param tagTestValues test values for the tag estimator
 * @param maxNumTermTestValues test values for the maximum term frequency estimator
 * @param fieldsList records extracted from the Wikipedia reference database
 * @return number of observations collected during the last run
 * @throws SQLException if the Wikipedia database is unavailable
 */
public int regress(float[] tagTestValues, float[] maxNumTermTestValues, List<String[]> fieldsList) throws SQLException {
    int numRecords = -1;
    NValidationSets validationSets = null;
    StringBuilder buf = new StringBuilder("tagEstimator,maxTermFreqEstimator,meanNNP,meanNotNNP,meanAll\n");

    /*
     * Sweep all combinations of the two estimator test values.
     */
    for (float tagTestValue : tagTestValues) {
        for (float maxNumTermTestValue : maxNumTermTestValues) {
            CNGramsModel.getInstance().setTagEstimator(tagTestValue);
            CNGramsModel.getInstance().setMaxTermFreqEstimator(maxNumTermTestValue);

            validationSets = new NValidationSets();
            numRecords = executeRun(fieldsList, validationSets);
            buf.append(validationSets.toString());
        }
    }

    /*
     * Save the regression results.
     */
    try {
        CTfIdfScore.ITfDiscriminant discriminant = CTfIdfScore.getInstance().getTfDiscrimant();
        final String fileName = REGRESSION_FILES + "_" + discriminant.getName();
        CFileUtil.write(fileName, buf.toString());
    } catch (IOException e) {
        CLogger.error(e.toString());
    }

    return numRecords;
}
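/*
 * Example (sketch): how regress(..) might be driven. The test values, the
 * record range and the 'trainer' reference are illustrative assumptions;
 * the CDbpediaSql query setup is taken from the cluster(..) method below.
 *
 *   float[] tagValues = { 0.8f, 0.9f, 1.0f };
 *   float[] maxTermValues = { 0.5f, 0.75f, 1.0f };
 *
 *   CDbpediaSql query = CDbpediaSql.getInstance();
 *   query.setQuery(new String[] { "label", "lgabstract" }, null);
 *   List<String[]> fieldsList = query.execute(0, 1000);
 *
 *   int numRecords = trainer.regress(tagValues, maxTermValues, fieldsList);
 */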
/**
 * Runs K-means clustering of the N-Gram frequency observations, sweeping
 * the tag and maximum term frequency estimators over the supplied test
 * values. Separate clusters are built for all N-Grams, 1-Grams only and
 * N-Grams with N > 1.
 *
 * @param tagTestValues test values for the tag estimator
 * @param maxNumTermTestValues test values for the maximum term frequency estimator
 * @param numClusters number of clusters used by the K-means algorithm
 * @return total number of observations collected across all runs
 * @throws SQLException if the Wikipedia database is unavailable
 */
public int cluster(float[] tagTestValues, float[] maxNumTermTestValues, int numClusters) throws SQLException {
    List<CDataPoint> allDataPointsList = new ArrayList<CDataPoint>();
    List<CDataPoint> oneGramDataPointsList = new ArrayList<CDataPoint>();
    List<CDataPoint> nGramDataPointsList = new ArrayList<CDataPoint>();

    int numRecords = 0;
    NClusteringSets clusteringSets = null;
    int dataIndex = 0;

    CDbpediaSql query = CDbpediaSql.getInstance();
    query.setQuery(new String[] {"label", "lgabstract"}, null);
    List<String[]> fieldsList = query.execute(_startIndex, _endIndex - _startIndex);

    for (float tagTestValue : tagTestValues) {
        for (float maxNumTermTestValue : maxNumTermTestValues) {
            CNGramsModel.getInstance().setTagEstimator(tagTestValue);
            CNGramsModel.getInstance().setMaxTermFreqEstimator(maxNumTermTestValue);

            clusteringSets = new NClusteringSets();
            numRecords += executeRun(fieldsList, clusteringSets);

            /*
             * Collect the data points for all N-Grams, 1-Grams only and
             * N-Grams with N > 1.
             */
            CDataPoint datapoint = null;
            dataIndex = allDataPointsList.size();
            List<double[]> valuesList = clusteringSets.getAllDatasetsList();
            for (double[] values : valuesList) {
                datapoint = new CDataPoint(values, dataIndex++);
                allDataPointsList.add(datapoint);
            }

            dataIndex = oneGramDataPointsList.size();
            valuesList = clusteringSets.getOneGramDatasetsList();
            for (double[] values : valuesList) {
                datapoint = new CDataPoint(values, dataIndex++);
                oneGramDataPointsList.add(datapoint);
            }

            dataIndex = nGramDataPointsList.size();
            valuesList = clusteringSets.getNGramDatasetsList();
            for (double[] values : valuesList) {
                datapoint = new CDataPoint(values, dataIndex++);
                nGramDataPointsList.add(datapoint);
            }
        }
    }

    /*
     * Aggregate the clustering results for all N-Grams, 1-Grams and
     * N-Grams with N > 1.
     */
    StringBuilder resultsBuf = new StringBuilder();
    CKMeansClustering kmeanClustering = null;

    if (allDataPointsList.size() > 0) {
        kmeanClustering = new CKMeansClustering(numClusters, 500, allDataPointsList);
        kmeanClustering.train();
        resultsBuf.append("All NGrams\n");
        resultsBuf.append(kmeanClustering.toString());
    }
    if (oneGramDataPointsList.size() > 0) {
        kmeanClustering = new CKMeansClustering(numClusters, 500, oneGramDataPointsList);
        kmeanClustering.train();
        resultsBuf.append("\n1-Grams only\n");
        resultsBuf.append(kmeanClustering.toString());
    }
    if (nGramDataPointsList.size() > 0) {
        kmeanClustering = new CKMeansClustering(numClusters, 500, nGramDataPointsList);
        kmeanClustering.train();
        resultsBuf.append("\nN-Grams only\n");
        resultsBuf.append(kmeanClustering.toString());
    }

    /*
     * Save the clustering results.
     */
    try {
        CTfIdfScore.ITfDiscriminant discriminant = CTfIdfScore.getInstance().getTfDiscrimant();
        final String fileName = CLUSTER_FILES + "_" + discriminant.getName() + "_" + String.valueOf(numClusters);
        CFileUtil.write(fileName, resultsBuf.toString());
    } catch (IOException e) {
        CLogger.error(e.toString());
    }

    return numRecords;
}
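/*
 * Example (sketch): invoking a clustering run with four clusters. The test
 * values and the 'trainer' reference are illustrative assumptions; the
 * record range is taken from the _startIndex and _endIndex fields.
 *
 *   int numRecords = trainer.cluster(new float[] { 0.9f, 1.0f },
 *                                    new float[] { 0.6f, 0.8f },
 *                                    4);
 */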
/** Compute the key parameters of the NGramsOrder model. */
private void compute() {
    int oneGramsCounter = 0;
    int nGramsCounter = 0;

    /*
     * Initialize the statistics for each N-Gram order.
     */
    NLabeledNGramsStats[] nGramsStat = new NLabeledNGramsStats[MAX_ORDER_NGRAM_STATS];
    NLabeled1GramsStats[] oneGramsStat = new NLabeled1GramsStats[MAX_ORDER_NGRAM_STATS];
    for (int k = 0; k < MAX_ORDER_NGRAM_STATS; k++) {
        nGramsStat[k] = new NLabeledNGramsStats(k);
        oneGramsStat[k] = new NLabeled1GramsStats(k);
    }

    int nGramRank = 0;
    for (CNGramsStats nGramsFrequencyStat : _nGramsFrequencyStatsList) {
        nGramRank = nGramsFrequencyStat.getNGramRank();

        /*
         * Collect only the lower order N-Gram labels.
         */
        if (nGramRank < MAX_ORDER_NGRAM_STATS) {
            if (nGramsFrequencyStat.isCompound()) {
                nGramsStat[nGramRank].add(nGramsFrequencyStat.getNumNGramOccurrences(),
                                          nGramsFrequencyStat.isNNP(),
                                          nGramsFrequencyStat.getMaxNumTermOccurrences());
                nGramsCounter++;
            } else {
                oneGramsStat[nGramRank].add(nGramsFrequencyStat.getNumNGramOccurrences(),
                                            nGramsFrequencyStat.isNNP(),
                                            -1);
                oneGramsCounter++;
            }
        }
    }

    CNGramsModel instance = CNGramsModel.getInstance();

    /*
     * Normalize to compute the average values.
     */
    StringBuilder buf = new StringBuilder();
    buf.append(CFileUtil.COMMENTS_FIRST_CHAR);
    buf.append(" nnpFudge:");
    buf.append(instance.getTagEstimator());
    buf.append(" compoundFudge:");
    buf.append(instance.getMaxTermFreqEstimator());
    buf.append("\n");

    buf.append(CFileUtil.COMMENTS_FIRST_CHAR);
    buf.append("1-Gram frequency model parameters\n");
    for (int k = 0; k < MAX_ORDER_NGRAM_STATS; k++) {
        if (oneGramsStat[k].compute(oneGramsCounter)) {
            buf.append(oneGramsStat[k].toString());
            buf.append("\n");
        }
    }
    buf.append(CEnv.ENTRIES_DELIM);
    buf.append("\n");

    buf.append(CFileUtil.COMMENTS_FIRST_CHAR);
    buf.append("N-Gram frequency model parameters\n");
    for (int k = 0; k < MAX_ORDER_NGRAM_STATS; k++) {
        if (nGramsStat[k].compute(nGramsCounter)) {
            buf.append(nGramsStat[k].toString());
            buf.append("\n");
        }
    }
    buf.append(CEnv.ENTRIES_DELIM);

    /*
     * Save the model parameters.
     */
    try {
        final String fileName = TRAINING_FILES + "_" + _fileIndex;
        CFileUtil.write(fileName, buf.toString());
    } catch (IOException e) {
        CLogger.error(e.toString());
    }
}
/**
 * Main method for the command line application to compute the taxonomy
 * model features using a Bayesian approach.
 *
 * @param args command line arguments: -train, -validate or -test
 */
public static void main(String[] args) {
    try {
        CEnv.init();

        if (args != null && args.length > 0) {
            CTaxonomyClassifier classifier = null;

            /*
             * Launch the training phase.
             */
            if (args[0].compareTo("-train") == 0) {
                NModelParams modelParams = new NModelParams();
                modelParams.addCategories();
                CTaxonomyModel.getInstance(modelParams);

                classifier = new CTaxonomyClassifier(1);
                classifier.train();
            }
            /*
             * Launch the validation phase.
             */
            else if (args[0].compareTo("-validate") == 0) {
                NModelParams modelParams = new NModelParams();
                try {
                    CTaxonomyModel.init(modelParams);
                    classifier = new CTaxonomyClassifier(1);
                    classifier.validate(VALIDATION_RANGE);

                    int numSamples = classifier.getNumSamples();
                    CLogger.info("Taxonomy validation done with " + numSamples + " samples",
                                 CLogger.TAXONOMY_TRAIN_TRACE);
                } catch (InitException e) {
                    CLogger.error(e.toString());
                }
            }
            /*
             * Launch the test phase.
             */
            else if (args[0].compareTo("-test") == 0) {
                if (args.length < 3) {
                    CLogger.error("Missing model parameters for the -test option");
                } else {
                    NModelParams modelParams = initModelParams(args[1], args[2]);
                    try {
                        CTaxonomyModel.init(modelParams);
                        classifier = new CTaxonomyClassifier(1);
                        classifier.test(TEST_RANGE);

                        int numSamples = classifier.getNumSamples();
                        CLogger.info("Taxonomy test done with " + numSamples + " samples",
                                     CLogger.TAXONOMY_TRAIN_TRACE);
                    } catch (InitException e) {
                        CLogger.error(e.toString());
                    }
                }
            }
        } else {
            CLogger.info("Command line: CTaxonomyTrainApp {-train,-validate,-test}");
        }
    } catch (InitException e) {
        CLogger.error(e.toString());
    }
}
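/*
 * Example (sketch): typical invocations of the command line application.
 * The two model parameter values passed to -test are illustrative
 * assumptions forwarded to initModelParams(args[1], args[2]).
 *
 *   java CTaxonomyTrainApp -train
 *   java CTaxonomyTrainApp -validate
 *   java CTaxonomyTrainApp -test param1 param2
 */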