Example #1
0
  /**
   * Runs a regression sweep of the N-Gram frequency model: for every combination of tag-estimator
   * and max-term-frequency-estimator test values, executes a validation run against the supplied
   * Wikipedia reference records and accumulates the per-run statistics as CSV rows.
   *
   * <p>The accumulated report (header: {@code tagEstimator,maxTermFreqEstimator,meanNNP,
   * meanNotNNP,meanAll}) is written to a file named from {@code REGRESSION_FILES} and the
   * current TF discriminant; write failures are logged, not rethrown.
   *
   * @param tagTestValues candidate values for the tag estimator, applied to the shared
   *     {@code CNGramsModel} singleton before each run
   * @param maxNumTermTestValues candidate values for the maximum term frequency estimator
   * @param fieldsList records extracted from the Wikipedia reference database, passed through
   *     to {@code executeRun}
   * @return number of records processed by the LAST run of the sweep (the value is overwritten
   *     on each iteration), or -1 if both test-value arrays are empty
   * @throws SQLException if the Wikipedia database is unavailable
   */
  public int regress(float[] tagTestValues, float[] maxNumTermTestValues, List<String[]> fieldsList)
      throws SQLException {
    int numRecords = -1;
    NValidationSets validationSets = null;
    // CSV header for the regression report; one row per (tag, maxTerm) pair is appended below.
    StringBuilder buf =
        new StringBuilder("tagEstimator,maxTermFreqEstimator,meanNNP,meanNotNNP,meanAll\n");

    for (float tagTestValue : tagTestValues) {
      for (float maxNumTermTestValue : maxNumTermTestValues) {

        // Configure the shared model singleton for this parameter combination.
        CNGramsModel.getInstance().setTagEstimator(tagTestValue);
        CNGramsModel.getInstance().setMaxTermFreqEstimator(maxNumTermTestValue);

        // Fresh statistics container per run; its toString() yields the CSV row(s).
        validationSets = new NValidationSets();
        numRecords = executeRun(fieldsList, validationSets);
        buf.append(validationSets.toString());
      }
    }

    // Persist the report; the file name encodes the active TF discriminant.
    try {
      CTfIdfScore.ITfDiscriminant discriminant = CTfIdfScore.getInstance().getTfDiscrimant();
      final String fileName = REGRESSION_FILES + "_" + discriminant.getName();
      CFileUtil.write(fileName, buf.toString());
    } catch (IOException e) {
      // Best-effort persistence: a failed write is logged but does not abort the regression.
      CLogger.error(e.toString());
    }

    return numRecords;
  }
Example #2
0
  /**
   * Runs the N-Gram frequency model over a grid of estimator test values, collects the resulting
   * data points, and clusters each family (all N-Grams, 1-Grams only, N-Grams with N &gt; 1) with
   * K-Means. The aggregated clustering report is written to a file named from
   * {@code CLUSTER_FILES}, the active TF discriminant, and the cluster count; write failures are
   * logged, not rethrown.
   *
   * @param tagTestValues candidate values for the tag estimator, applied to the shared
   *     {@code CNGramsModel} singleton before each run
   * @param maxNumTermTestValues candidate values for the maximum term frequency estimator
   * @param numClusters number of K-Means clusters to train
   * @return total number of records processed across all runs of the sweep
   * @throws SQLException if the Wikipedia database is unavailable
   */
  public int cluster(float[] tagTestValues, float[] maxNumTermTestValues, int numClusters)
      throws SQLException {

    List<CDataPoint> allDataPointsList = new ArrayList<CDataPoint>();
    List<CDataPoint> oneGramDataPointsList = new ArrayList<CDataPoint>();
    List<CDataPoint> nGramDataPointsList = new ArrayList<CDataPoint>();

    int numRecords = 0;
    CDbpediaSql query = CDbpediaSql.getInstance();
    query.setQuery(new String[] {"label", "lgabstract"}, null);

    // Pull the record window [_startIndex, _endIndex) from the reference database once,
    // then reuse it for every parameter combination.
    List<String[]> fieldsList = query.execute(_startIndex, _endIndex - _startIndex);

    for (float tagTestValue : tagTestValues) {
      for (float maxNumTermTestValue : maxNumTermTestValues) {

        // Configure the shared model singleton for this parameter combination.
        CNGramsModel.getInstance().setTagEstimator(tagTestValue);
        CNGramsModel.getInstance().setMaxTermFreqEstimator(maxNumTermTestValue);

        NClusteringSets clusteringSets = new NClusteringSets();
        numRecords += executeRun(fieldsList, clusteringSets);

        // BUG FIX: the original code never incremented the index for the all-grams and
        // 1-gram batches, so every CDataPoint added in one batch shared the same index
        // (only the N-gram loop used dataIndex++). Each point now gets a unique,
        // monotonically increasing index in all three lists.
        appendDataPoints(allDataPointsList, clusteringSets.getAllDatasetsList());
        appendDataPoints(oneGramDataPointsList, clusteringSets.getOneGramDatasetsList());
        appendDataPoints(nGramDataPointsList, clusteringSets.getNGramDatasetsList());
      }
    }

    /*
     * Aggregate the clustering results for all N-Grams, 1-Grams and
     * N-Grams with N >1
     */
    StringBuilder resultsBuf = new StringBuilder();
    appendClusteringResults(resultsBuf, "All NGrams\n", allDataPointsList, numClusters);
    appendClusteringResults(resultsBuf, "\n1-Grams only\n", oneGramDataPointsList, numClusters);
    appendClusteringResults(resultsBuf, "\nN-Grams only\n", nGramDataPointsList, numClusters);

    /*
     * Save the clustering results ...
     */
    try {
      CTfIdfScore.ITfDiscriminant discriminant = CTfIdfScore.getInstance().getTfDiscrimant();
      final String fileName =
          CLUSTER_FILES + "_" + discriminant.getName() + "_" + String.valueOf(numClusters);
      CFileUtil.write(fileName, resultsBuf.toString());
    } catch (IOException e) {
      // Best-effort persistence: a failed write is logged but does not abort clustering.
      CLogger.error(e.toString());
    }

    return numRecords;
  }

  /**
   * Wraps each raw values array in a {@code CDataPoint} carrying a unique, increasing index
   * (continuing from {@code target.size()}) and appends it to {@code target}.
   */
  private static void appendDataPoints(List<CDataPoint> target, List<double[]> valuesList) {
    int dataIndex = target.size();
    for (double[] values : valuesList) {
      target.add(new CDataPoint(values, dataIndex++));
    }
  }

  /**
   * Trains a K-Means model on {@code dataPoints} (skipped when empty) and appends the labeled
   * clustering results to {@code buf}.
   */
  private static void appendClusteringResults(
      StringBuilder buf, String label, List<CDataPoint> dataPoints, int numClusters) {
    if (!dataPoints.isEmpty()) {
      // 500 = maximum training iterations, as in the original inline implementation.
      CKMeansClustering kmeanClustering = new CKMeansClustering(numClusters, 500, dataPoints);
      kmeanClustering.train();
      buf.append(label);
      buf.append(kmeanClustering.toString());
    }
  }
Example #3
0
    /**
     * Computes the key parameters of the NGramsOrder model and persists them to a training file.
     * Statistics are accumulated per N-Gram rank, split between compound (N &gt; 1) and
     * single-term grams, then normalized and serialized; write failures are logged, not rethrown.
     */
    private void compute() {

      /*
       * Per-rank accumulators for compound and single-term statistics.
       */
      final NLabeledNGramsStats[] compoundStats = new NLabeledNGramsStats[MAX_ORDER_NGRAM_STATS];
      final NLabeled1GramsStats[] singleStats = new NLabeled1GramsStats[MAX_ORDER_NGRAM_STATS];
      for (int rank = 0; rank < MAX_ORDER_NGRAM_STATS; rank++) {
        compoundStats[rank] = new NLabeledNGramsStats(rank);
        singleStats[rank] = new NLabeled1GramsStats(rank);
      }

      int numSingleGrams = 0;
      int numCompoundGrams = 0;

      /*
       * Accumulate frequency statistics, collecting only the lower-order N-Gram labels.
       */
      for (CNGramsStats freqStats : _nGramsFrequencyStatsList) {
        final int rank = freqStats.getNGramRank();
        if (rank >= MAX_ORDER_NGRAM_STATS) {
          continue;
        }
        if (freqStats.isCompound()) {
          compoundStats[rank].add(
              freqStats.getNumNGramOccurrences(),
              freqStats.isNNP(),
              freqStats.getMaxNumTermOccurrences());
          numCompoundGrams++;
        } else {
          // Single-term grams carry no max-term-occurrence count; -1 marks it as absent.
          singleStats[rank].add(freqStats.getNumNGramOccurrences(), freqStats.isNNP(), -1);
          numSingleGrams++;
        }
      }

      final CNGramsModel model = CNGramsModel.getInstance();

      /*
       * Normalize to compute the average values and serialize the model parameters.
       */
      final StringBuilder out = new StringBuilder();
      out.append(CFileUtil.COMMENTS_FIRST_CHAR)
          .append(" nnpFudge:")
          .append(model.getTagEstimator())
          .append(" compoundFudge:")
          .append(model.getMaxTermFreqEstimator())
          .append("\n");

      out.append(CFileUtil.COMMENTS_FIRST_CHAR).append("1-Gram frequency model parameters\n");
      for (int rank = 0; rank < MAX_ORDER_NGRAM_STATS; rank++) {
        // compute() returns false for ranks with no data; those rows are omitted.
        if (singleStats[rank].compute(numSingleGrams)) {
          out.append(singleStats[rank].toString()).append("\n");
        }
      }
      out.append(CEnv.ENTRIES_DELIM).append("\n");

      out.append(CFileUtil.COMMENTS_FIRST_CHAR).append("N-Gram frequency model parameters\n");
      for (int rank = 0; rank < MAX_ORDER_NGRAM_STATS; rank++) {
        if (compoundStats[rank].compute(numCompoundGrams)) {
          out.append(compoundStats[rank].toString()).append("\n");
        }
      }
      out.append(CEnv.ENTRIES_DELIM);

      try {
        CFileUtil.write(TRAINING_FILES + "_" + _fileIndex, out.toString());
      } catch (IOException e) {
        // Best-effort persistence: a failed write is logged but does not abort computation.
        CLogger.error(e.toString());
      }
    }