Exemplo n.º 1
0
  /**
   * Runs one validation pass for every combination of tag estimator and maximum term frequency
   * estimator, then writes the collected results to a regression file.
   *
   * <p>For each pair of test values, both estimators are set on the shared {@code CNGramsModel}
   * instance, a fresh {@code NValidationSets} is filled by {@code executeRun}, and its string
   * form is appended to the report. The report starts with a comma-separated header line and is
   * written to a file named {@code REGRESSION_FILES + "_" + <TF discriminant name>}. An {@code
   * IOException} raised while writing is logged and not rethrown.
   *
   * @param tagTestValues values applied to the tag estimator (outer loop)
   * @param maxNumTermTestValues values applied to the maximum term frequency estimator (inner
   *     loop)
   * @param fieldsList records handed unchanged to every run
   * @return the value returned by {@code executeRun} for the last combination, or -1 if no
   *     combination was executed (either array is empty)
   * @throws SQLException propagated from the run; presumably raised when the reference database
   *     is unavailable
   */
  public int regress(float[] tagTestValues, float[] maxNumTermTestValues, List<String[]> fieldsList)
      throws SQLException {
    final StringBuilder report = new StringBuilder();
    report.append("tagEstimator,maxTermFreqEstimator,meanNNP,meanNotNNP,meanAll\n");

    // Stays at -1 unless at least one (tag, maxTermFreq) combination is executed.
    int lastRunCount = -1;

    for (int tagPos = 0; tagPos < tagTestValues.length; tagPos++) {
      final float tagValue = tagTestValues[tagPos];

      for (int termPos = 0; termPos < maxNumTermTestValues.length; termPos++) {
        // The model is configured through the shared instance before every run.
        CNGramsModel.getInstance().setTagEstimator(tagValue);
        CNGramsModel.getInstance().setMaxTermFreqEstimator(maxNumTermTestValues[termPos]);

        final NValidationSets runSets = new NValidationSets();
        lastRunCount = executeRun(fieldsList, runSets);

        final String runSummary = runSets.toString();
        report.append(runSummary);
      }
    }

    try {
      final String discriminantName = CTfIdfScore.getInstance().getTfDiscrimant().getName();
      CFileUtil.write(REGRESSION_FILES + "_" + discriminantName, report.toString());
    } catch (IOException writeFailure) {
      // Best effort: a failed write is reported but does not abort the regression.
      CLogger.error(writeFailure.toString());
    }

    return lastRunCount;
  }
Exemplo n.º 2
0
  /**
   * Collects data points for every combination of tag estimator and maximum term frequency
   * estimator, clusters them with K-means and writes the clustering results to a file.
   *
   * <p>The records (fields {@code "label"} and {@code "lgabstract"}) are fetched once through
   * {@code CDbpediaSql} using {@code _startIndex} and {@code _endIndex - _startIndex}. For each
   * pair of test values the estimators are set on the shared {@code CNGramsModel} instance and
   * {@code executeRun} fills a fresh {@code NClusteringSets}. Its three data sets (all N-Grams,
   * 1-Grams, N-Grams) are appended to three accumulating lists. Every data point is created
   * with its position in the list it is appended to as index.
   *
   * <p>Each non-empty list is then clustered separately and the textual results are written to
   * a file named {@code CLUSTER_FILES + "_" + <TF discriminant name> + "_" + numClusters}. An
   * {@code IOException} raised while writing is logged and not rethrown.
   *
   * @param tagTestValues values applied to the tag estimator (outer loop)
   * @param maxNumTermTestValues values applied to the maximum term frequency estimator (inner
   *     loop)
   * @param numClusters number of clusters passed to the K-means clustering
   * @return sum of the values returned by {@code executeRun} over all combinations
   * @throws SQLException if the query or a run fails with a database error
   */
  public int cluster(float[] tagTestValues, float[] maxNumTermTestValues, int numClusters)
      throws SQLException {

    List<CDataPoint> allDataPointsList = new ArrayList<CDataPoint>();
    List<CDataPoint> oneGramDataPointsList = new ArrayList<CDataPoint>();
    List<CDataPoint> nGramDataPointsList = new ArrayList<CDataPoint>();

    CDbpediaSql query = CDbpediaSql.getInstance();
    query.setQuery(new String[] {"label", "lgabstract"}, null);

    // The same records are reused for every estimator combination.
    List<String[]> fieldsList = query.execute(_startIndex, _endIndex - _startIndex);

    int numRecords = 0;
    for (float tagTestValue : tagTestValues) {
      for (float maxNumTermTestValue : maxNumTermTestValues) {

        CNGramsModel.getInstance().setTagEstimator(tagTestValue);
        CNGramsModel.getInstance().setMaxTermFreqEstimator(maxNumTermTestValue);

        NClusteringSets clusteringSets = new NClusteringSets();
        numRecords += executeRun(fieldsList, clusteringSets);

        appendDataPoints(clusteringSets.getAllDatasetsList(), allDataPointsList);
        appendDataPoints(clusteringSets.getOneGramDatasetsList(), oneGramDataPointsList);
        appendDataPoints(clusteringSets.getNGramDatasetsList(), nGramDataPointsList);
      }
    }

    /*
     * Aggregate the clustering results for all N-Grams, 1-Grams and
     * N-Grams with N >1
     */
    StringBuilder resultsBuf = new StringBuilder();
    appendClusteringResults("All NGrams\n", allDataPointsList, numClusters, resultsBuf);
    appendClusteringResults("\n1-Grams only\n", oneGramDataPointsList, numClusters, resultsBuf);
    appendClusteringResults("\nN-Grams only\n", nGramDataPointsList, numClusters, resultsBuf);

    /*
     * Save the clustering results ...
     */
    try {
      CTfIdfScore.ITfDiscriminant discriminant = CTfIdfScore.getInstance().getTfDiscrimant();
      final String fileName = CLUSTER_FILES + "_" + discriminant.getName() + "_" + numClusters;
      CFileUtil.write(fileName, resultsBuf.toString());
    } catch (IOException e) {
      // Best effort: a failed write is reported but does not abort the clustering.
      CLogger.error(e.toString());
    }

    return numRecords;
  }

  /**
   * Wraps every value vector in a {@code CDataPoint} and appends it to {@code target}.
   *
   * <p>The index given to each data point is its position in {@code target}, so indices keep
   * increasing across successive calls on the same list. Previously only one of the three data
   * sets incremented the index; the other two reused one index for a whole run.
   */
  private void appendDataPoints(List<double[]> valuesList, List<CDataPoint> target) {
    for (double[] values : valuesList) {
      target.add(new CDataPoint(values, target.size()));
    }
  }

  /**
   * Clusters {@code dataPoints} and appends {@code title} followed by the textual clustering
   * result to {@code resultsBuf}. Does nothing when {@code dataPoints} is empty.
   */
  private void appendClusteringResults(
      String title, List<CDataPoint> dataPoints, int numClusters, StringBuilder resultsBuf) {
    if (dataPoints.isEmpty()) {
      return;
    }
    // NOTE(review): 500 is presumably the iteration limit - confirm against CKMeansClustering.
    CKMeansClustering kmeanClustering = new CKMeansClustering(numClusters, 500, dataPoints);
    kmeanClustering.train();
    resultsBuf.append(title);
    resultsBuf.append(kmeanClustering.toString());
  }