/**
 * Sweeps every combination of tag estimator and maximum term frequency
 * estimator, runs one validation pass per combination and saves the
 * collected results to a regression file.
 *
 * <p>The report starts with a fixed CSV header line; after each pass the
 * string form of that pass's {@code NValidationSets} is appended. The
 * report is written to a file named {@code REGRESSION_FILES}, an
 * underscore, and the name of the current TF discriminant. An
 * {@code IOException} raised while saving is logged through
 * {@code CLogger} and is not propagated.
 *
 * @param tagTestValues values applied in turn through
 *        {@code CNGramsModel.setTagEstimator}
 * @param maxNumTermTestValues values applied in turn through
 *        {@code CNGramsModel.setMaxTermFreqEstimator}
 * @param fieldsList records handed unchanged to {@code executeRun} on
 *        every pass
 * @return the value returned by {@code executeRun} for the last
 *         combination, or -1 if no combination was run
 * @throws SQLException declared for the validation passes (presumably
 *         raised by {@code executeRun} when the reference database is
 *         unavailable; confirm against its definition)
 */
public int regress(float[] tagTestValues, float[] maxNumTermTestValues, List<String[]> fieldsList) throws SQLException {
    final StringBuilder report =
            new StringBuilder("tagEstimator,maxTermFreqEstimator,meanNNP,meanNotNNP,meanAll\n");
    // Stays at -1 when either array is empty and no pass is executed.
    int lastRunCount = -1;

    for (int t = 0; t < tagTestValues.length; t++) {
        final float tagEstimator = tagTestValues[t];

        for (int m = 0; m < maxNumTermTestValues.length; m++) {
            final float maxTermFreqEstimator = maxNumTermTestValues[m];

            // Reconfigure the shared model before each pass.
            CNGramsModel.getInstance().setTagEstimator(tagEstimator);
            CNGramsModel.getInstance().setMaxTermFreqEstimator(maxTermFreqEstimator);

            // A fresh result holder per pass; only the latest count is kept.
            final NValidationSets passResults = new NValidationSets();
            lastRunCount = executeRun(fieldsList, passResults);
            report.append(passResults.toString());
        }
    }

    try {
        CFileUtil.write(
                REGRESSION_FILES + "_" + CTfIdfScore.getInstance().getTfDiscrimant().getName(),
                report.toString());
    } catch (IOException ioFailure) {
        // Saving is best effort: report the failure and still return the count.
        CLogger.error(ioFailure.toString());
    }

    return lastRunCount;
}
/**
 * Runs the N-Gram model over a range of reference records for every
 * combination of estimator values, gathers the resulting data sets and
 * clusters them with K-means.
 *
 * <p>The {@code label} and {@code lgabstract} fields are fetched once
 * through {@code CDbpediaSql}, using {@code _startIndex} and
 * {@code _endIndex - _startIndex} as query arguments (presumably offset
 * and record count; confirm against {@code CDbpediaSql.execute}). For each
 * estimator combination, the data sets returned by {@code NClusteringSets}
 * are appended to three pools: all N-Grams, 1-Grams and N-Grams. Each
 * non-empty pool is clustered separately and its report is written to a
 * file named {@code CLUSTER_FILES}, the TF discriminant name and the number
 * of clusters, joined by underscores. An {@code IOException} raised while
 * saving is logged through {@code CLogger} and is not propagated.
 *
 * @param tagTestValues values applied in turn through
 *        {@code CNGramsModel.setTagEstimator}
 * @param maxNumTermTestValues values applied in turn through
 *        {@code CNGramsModel.setMaxTermFreqEstimator}
 * @param numClusters number of clusters passed to each
 *        {@code CKMeansClustering} instance
 * @return sum of the values returned by {@code executeRun} over all
 *         estimator combinations
 * @throws SQLException declared for the database query and the runs
 *         (presumably raised when the reference database is unavailable)
 */
public int cluster(float[] tagTestValues, float[] maxNumTermTestValues, int numClusters) throws SQLException {
    List<CDataPoint> allDataPointsList = new ArrayList<CDataPoint>();
    List<CDataPoint> oneGramDataPointsList = new ArrayList<CDataPoint>();
    List<CDataPoint> nGramDataPointsList = new ArrayList<CDataPoint>();

    CDbpediaSql query = CDbpediaSql.getInstance();
    query.setQuery(new String[] {"label", "lgabstract"}, null);
    List<String[]> fieldsList = query.execute(_startIndex, _endIndex - _startIndex);

    int numRecords = 0;
    for (float tagTestValue : tagTestValues) {
        for (float maxNumTermTestValue : maxNumTermTestValues) {
            CNGramsModel.getInstance().setTagEstimator(tagTestValue);
            CNGramsModel.getInstance().setMaxTermFreqEstimator(maxNumTermTestValue);

            NClusteringSets clusteringSets = new NClusteringSets();
            numRecords += executeRun(fieldsList, clusteringSets);

            // Previously only the N-Gram pool advanced the index per point;
            // the other two pools reused one index for a whole batch.
            appendIndexedDataPoints(allDataPointsList, clusteringSets.getAllDatasetsList());
            appendIndexedDataPoints(oneGramDataPointsList, clusteringSets.getOneGramDatasetsList());
            appendIndexedDataPoints(nGramDataPointsList, clusteringSets.getNGramDatasetsList());
        }
    }

    /*
     * Aggregate the clustering results for all N-Grams, 1-Grams and
     * N-Grams with N > 1
     */
    StringBuilder resultsBuf = new StringBuilder();
    appendClusteringSection(resultsBuf, "All NGrams\n", numClusters, allDataPointsList);
    appendClusteringSection(resultsBuf, "\n1-Grams only\n", numClusters, oneGramDataPointsList);
    appendClusteringSection(resultsBuf, "\nN-Grams only\n", numClusters, nGramDataPointsList);

    /*
     * Save the clustering results ...
     */
    try {
        CTfIdfScore.ITfDiscriminant discriminant = CTfIdfScore.getInstance().getTfDiscrimant();
        final String fileName = CLUSTER_FILES + "_" + discriminant.getName() + "_" + numClusters;
        CFileUtil.write(fileName, resultsBuf.toString());
    } catch (IOException e) {
        // Saving is best effort: report the failure and still return the count.
        CLogger.error(e.toString());
    }
    return numRecords;
}

/**
 * Wraps each value array in a {@code CDataPoint} and appends it to the
 * target list, giving every point the position it occupies in that list.
 */
private void appendIndexedDataPoints(List<CDataPoint> target, List<double[]> valuesList) {
    for (double[] values : valuesList) {
        // target.size() grows with every add, so each point gets its own index.
        target.add(new CDataPoint(values, target.size()));
    }
}

/**
 * Trains a K-means model on the given points and appends the title followed
 * by the model's string form to the report; does nothing for an empty list.
 */
private void appendClusteringSection(StringBuilder report, String title, int numClusters,
        List<CDataPoint> dataPoints) {
    if (dataPoints.isEmpty()) {
        return;
    }
    // 500 is presumably the iteration cap; confirm against CKMeansClustering.
    CKMeansClustering kmeanClustering = new CKMeansClustering(numClusters, 500, dataPoints);
    kmeanClustering.train();
    report.append(title);
    report.append(kmeanClustering.toString());
}