예제 #1
0
  /**
   * Runs one pass over the supplied records and feeds the extracted N-Gram statistics into
   * {@code sets}.
   *
   * <p>For every record accepted by {@code sets.isValid}, {@code extractFragment} is called with
   * {@code fields[1]} and {@code fields[0]} (in that order). Each non-null result is added to
   * {@code sets}. A {@code SemanticAnalysisException} raised for a record is logged and the pass
   * continues with the next record. Once all records have been visited, {@code sets.compute()} is
   * invoked.
   *
   * @param entriesList records to process, one {@code String[]} per record
   * @param sets collector that validates records, receives the statistics and is finally asked to
   *     compute its results
   * @return number of statistics that were added to {@code sets}
   * @throws SQLException propagated from the calls made during the pass
   */
  private int executeRun(List<String[]> entriesList, ISets sets) throws SQLException {
    int counter = 0;

    for (String[] fields : entriesList) {
      if (sets.isValid(fields)) {
        try {
          CNGramsStats nGramsFrequencyStat = extractFragment(fields[1], fields[0]);

          /*
           * Adds this new observation to the regression or classifier.
           */
          if (nGramsFrequencyStat != null) {
            sets.add(nGramsFrequencyStat);
            counter++;
          }
        } catch (SemanticAnalysisException e) {
          // Best effort: a record that cannot be analyzed is logged and skipped.
          CLogger.error(e.toString());
        }
      }
    }
    sets.compute();
    return counter;
  }
예제 #2
0
  /**
   * Runs {@code executeRun} once for every combination of tag estimator and maximum term
   * frequency estimator, and saves the collected results to a file.
   *
   * <p>For each pair of test values the two estimators are set on the {@code CNGramsModel}
   * singleton, a fresh {@code NValidationSets} is filled by {@code executeRun}, and its string
   * representation is appended below a fixed header line. The report is then written to a file
   * whose name is {@code REGRESSION_FILES}, an underscore and the name of the current term
   * frequency discriminant. An {@code IOException} while writing is logged and does not abort the
   * method.
   *
   * @param tagTestValues values to try for the tag estimator
   * @param maxNumTermTestValues values to try for the maximum term frequency estimator
   * @param fieldsList records handed to every run
   * @return the count returned by the last run, or {@code -1} if no combination was executed
   *     (either array is empty)
   * @throws SQLException propagated from {@code executeRun}
   */
  public int regress(float[] tagTestValues, float[] maxNumTermTestValues, List<String[]> fieldsList)
      throws SQLException {
    // Stays at -1 when there is no (tag, maxTerm) combination to run.
    int numRecords = -1;
    StringBuilder buf =
        new StringBuilder("tagEstimator,maxTermFreqEstimator,meanNNP,meanNotNNP,meanAll\n");

    for (float tagTestValue : tagTestValues) {
      for (float maxNumTermTestValue : maxNumTermTestValues) {

        CNGramsModel.getInstance().setTagEstimator(tagTestValue);
        CNGramsModel.getInstance().setMaxTermFreqEstimator(maxNumTermTestValue);

        // A new collector per combination so that runs do not accumulate into each other.
        NValidationSets validationSets = new NValidationSets();
        numRecords = executeRun(fieldsList, validationSets);
        buf.append(validationSets.toString());
      }
    }

    try {
      CTfIdfScore.ITfDiscriminant discriminant = CTfIdfScore.getInstance().getTfDiscrimant();
      final String fileName = REGRESSION_FILES + "_" + discriminant.getName();
      CFileUtil.write(fileName, buf.toString());
    } catch (IOException e) {
      // Saving the report is best effort; the run count is still returned.
      CLogger.error(e.toString());
    }

    return numRecords;
  }
예제 #3
0
  /**
   * Collects N-Gram data points for every combination of estimator values, clusters them with
   * K-Means and saves the textual clustering results to a file.
   *
   * <p>The records are fetched once through {@code CDbpediaSql} (columns {@code label} and {@code
   * lgabstract}), starting at {@code _startIndex} for {@code _endIndex - _startIndex} records. For
   * each (tag estimator, maximum term frequency estimator) pair a fresh {@code NClusteringSets}
   * is filled by {@code executeRun}; its three value lists (all, 1-Gram, N-Gram) are appended to
   * three accumulating data point lists. Every non-empty list is then clustered separately and
   * the three reports are written to a file named from {@code CLUSTER_FILES}, the current
   * discriminant name and {@code numClusters}. An {@code IOException} while writing is logged.
   *
   * @param tagTestValues values to try for the tag estimator
   * @param maxNumTermTestValues values to try for the maximum term frequency estimator
   * @param numClusters number of clusters passed to {@code CKMeansClustering}
   * @return sum of the counts returned by {@code executeRun} over all combinations
   * @throws SQLException propagated from {@code executeRun}
   */
  public int cluster(float[] tagTestValues, float[] maxNumTermTestValues, int numClusters)
      throws SQLException {

    List<CDataPoint> allDataPointsList = new ArrayList<CDataPoint>();
    List<CDataPoint> oneGramDataPointsList = new ArrayList<CDataPoint>();
    List<CDataPoint> nGramDataPointsList = new ArrayList<CDataPoint>();

    CDbpediaSql query = CDbpediaSql.getInstance();
    query.setQuery(new String[] {"label", "lgabstract"}, null);
    List<String[]> fieldsList = query.execute(_startIndex, _endIndex - _startIndex);

    int numRecords = 0;
    for (float tagTestValue : tagTestValues) {
      for (float maxNumTermTestValue : maxNumTermTestValues) {

        CNGramsModel.getInstance().setTagEstimator(tagTestValue);
        CNGramsModel.getInstance().setMaxTermFreqEstimator(maxNumTermTestValue);

        NClusteringSets clusteringSets = new NClusteringSets();
        numRecords += executeRun(fieldsList, clusteringSets);

        appendDataPoints(clusteringSets.getAllDatasetsList(), allDataPointsList);
        appendDataPoints(clusteringSets.getOneGramDatasetsList(), oneGramDataPointsList);
        appendDataPoints(clusteringSets.getNGramDatasetsList(), nGramDataPointsList);
      }
    }

    /*
     * Aggregate the clustering results for all N-Grams, 1-Grams and
     * N-Grams with N > 1
     */
    StringBuilder resultsBuf = new StringBuilder();
    appendClusteringReport(resultsBuf, "All NGrams\n", numClusters, allDataPointsList);
    appendClusteringReport(resultsBuf, "\n1-Grams only\n", numClusters, oneGramDataPointsList);
    appendClusteringReport(resultsBuf, "\nN-Grams only\n", numClusters, nGramDataPointsList);

    /*
     * Save the clustering results ...
     */
    try {
      CTfIdfScore.ITfDiscriminant discriminant = CTfIdfScore.getInstance().getTfDiscrimant();
      final String fileName = CLUSTER_FILES + "_" + discriminant.getName() + "_" + numClusters;
      CFileUtil.write(fileName, resultsBuf.toString());
    } catch (IOException e) {
      // Saving the report is best effort; the record count is still returned.
      CLogger.error(e.toString());
    }

    return numRecords;
  }

  /**
   * Wraps each value array in a {@code CDataPoint} and appends it to {@code target}.
   *
   * <p>Indices continue from the current size of {@code target} and increase by one per point, so
   * every data point in a list gets a distinct index.
   */
  private void appendDataPoints(List<double[]> valuesList, List<CDataPoint> target) {
    int dataIndex = target.size();
    for (double[] values : valuesList) {
      target.add(new CDataPoint(values, dataIndex++));
    }
  }

  /**
   * Trains a K-Means clustering on {@code dataPoints} and appends {@code header} followed by the
   * clustering's string form to {@code out}; does nothing when {@code dataPoints} is empty.
   */
  private void appendClusteringReport(
      StringBuilder out, String header, int numClusters, List<CDataPoint> dataPoints) {
    if (dataPoints.isEmpty()) {
      return;
    }
    // 500 is the second constructor argument used by the original code
    // (presumably an iteration cap -- TODO confirm against CKMeansClustering).
    CKMeansClustering kmeanClustering = new CKMeansClustering(numClusters, 500, dataPoints);
    kmeanClustering.train();
    out.append(header);
    out.append(kmeanClustering.toString());
  }
예제 #4
0
    /**
     * Aggregates the collected N-Gram statistics per rank and writes the resulting model
     * parameters to the training file.
     *
     * <p>Only entries whose rank is below {@code MAX_ORDER_NGRAM_STATS} are counted. Compound
     * entries go to the N-Gram statistics, all others to the 1-Gram statistics. The output starts
     * with a comment line holding the current estimator values, followed by one section per
     * statistics family. An {@code IOException} while writing is logged.
     */
    private void compute() {

      /*
       * One statistics holder per rank, for each of the two families.
       */
      final NLabeledNGramsStats[] compoundStats = new NLabeledNGramsStats[MAX_ORDER_NGRAM_STATS];
      final NLabeled1GramsStats[] singleTermStats = new NLabeled1GramsStats[MAX_ORDER_NGRAM_STATS];
      for (int rank = 0; rank < MAX_ORDER_NGRAM_STATS; rank++) {
        compoundStats[rank] = new NLabeledNGramsStats(rank);
        singleTermStats[rank] = new NLabeled1GramsStats(rank);
      }

      int singleTermCount = 0;
      int compoundCount = 0;
      for (CNGramsStats stat : _nGramsFrequencyStatsList) {
        final int rank = stat.getNGramRank();

        // Higher order N-Gram labels are not part of the statistics.
        if (rank >= MAX_ORDER_NGRAM_STATS) {
          continue;
        }

        if (!stat.isCompound()) {
          singleTermStats[rank].add(stat.getNumNGramOccurrences(), stat.isNNP(), -1);
          singleTermCount++;
        } else {
          compoundStats[rank].add(
              stat.getNumNGramOccurrences(), stat.isNNP(), stat.getMaxNumTermOccurrences());
          compoundCount++;
        }
      }

      final CNGramsModel model = CNGramsModel.getInstance();

      /*
       * Header comment line with the estimator values used for this run.
       */
      final StringBuilder report = new StringBuilder();
      report
          .append(CFileUtil.COMMENTS_FIRST_CHAR)
          .append(" nnpFudge:")
          .append(model.getTagEstimator())
          .append(" compoundFudge:")
          .append(model.getMaxTermFreqEstimator())
          .append("\n");

      /*
       * 1-Gram section: a holder is printed only when its compute() call returns true.
       */
      report.append(CFileUtil.COMMENTS_FIRST_CHAR).append("1-Gram frequency model parameters\n");
      for (NLabeled1GramsStats singleTermStat : singleTermStats) {
        if (singleTermStat.compute(singleTermCount)) {
          report.append(singleTermStat.toString()).append("\n");
        }
      }
      report.append(CEnv.ENTRIES_DELIM).append("\n");

      /*
       * N-Gram section, same layout as above.
       */
      report.append(CFileUtil.COMMENTS_FIRST_CHAR).append("N-Gram frequency model parameters\n");
      for (NLabeledNGramsStats compoundStat : compoundStats) {
        if (compoundStat.compute(compoundCount)) {
          report.append(compoundStat.toString()).append("\n");
        }
      }
      report.append(CEnv.ENTRIES_DELIM);

      try {
        CFileUtil.write(TRAINING_FILES + "_" + _fileIndex, report.toString());
      } catch (IOException e) {
        CLogger.error(e.toString());
      }
    }
예제 #5
0
  /**
   * Main method for the command line application to compute the taxonomy model features using a
   * Bayesian approach.
   *
   * <p>Supported options (first argument):
   *
   * <ul>
   *   <li>{@code -train}: builds the model parameters with their categories and trains the
   *       classifier.
   *   <li>{@code -validate}: initializes the model with default parameters and validates over
   *       {@code VALIDATION_RANGE}.
   *   <li>{@code -test <param1> <param2>}: initializes the model with the parameters produced by
   *       {@code initModelParams(param1, param2)} and tests over {@code TEST_RANGE}.
   * </ul>
   *
   * <p>With no argument, an unknown option, or a {@code -test} option lacking its two parameters,
   * the usage line is logged and nothing else is done. An {@code InitException} raised at any
   * stage is logged.
   *
   * @param args command line arguments; {@code args[0]} selects the phase, {@code args[1]} and
   *     {@code args[2]} are only read for {@code -test}
   */
  public static void main(String[] args) {
    final String usage =
        "Command line: CTaxonomyTrainApp {-train | -validate | -test <param1> <param2>}";

    try {
      CEnv.init();
      if (args == null || args.length == 0) {
        CLogger.info(usage);
        return;
      }

      final String option = args[0];

      /*
       * Launch the training phase.
       */
      if ("-train".equals(option)) {
        NModelParams modelParams = new NModelParams();
        modelParams.addCategories();

        CTaxonomyModel.getInstance(modelParams);

        CTaxonomyClassifier classifier = new CTaxonomyClassifier(1);
        classifier.train();
      }

      /*
       * Launch the validation phase.
       */
      else if ("-validate".equals(option)) {
        NModelParams modelParams = new NModelParams();

        CTaxonomyModel.init(modelParams);
        CTaxonomyClassifier classifier = new CTaxonomyClassifier(1);
        classifier.validate(VALIDATION_RANGE);
        int numSamples = classifier.getNumSamples();
        CLogger.info(
            "Taxonomy validation done with " + numSamples + " samples",
            CLogger.TAXONOMY_TRAIN_TRACE);
      }

      /*
       * Launch the test phase.
       */
      else if ("-test".equals(option)) {
        // args[1] and args[2] are read below; reject the call instead of failing with an
        // ArrayIndexOutOfBoundsException.
        if (args.length < 3) {
          CLogger.error("Option -test requires two parameters");
          CLogger.info(usage);
          return;
        }
        NModelParams modelParams = initModelParams(args[1], args[2]);

        CTaxonomyModel.init(modelParams);
        CTaxonomyClassifier classifier = new CTaxonomyClassifier(1);
        classifier.test(TEST_RANGE);
        int numSamples = classifier.getNumSamples();
        CLogger.info(
            "Taxonomy test done with " + numSamples + " samples", CLogger.TAXONOMY_TRAIN_TRACE);
      }

      /*
       * Anything else is a usage error.
       */
      else {
        CLogger.error("Unknown option: " + option);
        CLogger.info(usage);
      }
    } catch (InitException e) {
      // Single handler: initialization failures of any phase are logged the same way.
      CLogger.error(e.toString());
    }
  }