Exemplo n.º 1
0
  @Override
  public void trainMostSimilar(List<EnsembleSim> simList) {
    if (simList.isEmpty()) {
      throw new IllegalStateException("no examples to train on!");
    }
    mostSimilarInterpolator.trainMostSimilar(simList);

    // Remove things that have no observed metrics
    List<EnsembleSim> pruned = new ArrayList<EnsembleSim>();
    for (EnsembleSim es : simList) {
      if (es != null && es.getNumMetricsWithScore() > 0) {
        pruned.add(es);
      }
    }

    double[][] X = new double[pruned.size()][numMetrics * 2];
    double[] Y = new double[pruned.size()];
    for (int i = 0; i < pruned.size(); i++) {
      Y[i] = pruned.get(i).knownSim.similarity;
      EnsembleSim es = mostSimilarInterpolator.interpolate(pruned.get(i));
      for (int j = 0; j < numMetrics; j++) {
        X[i][2 * j] = es.getScores().get(j);
        X[i][2 * j + 1] = Math.log(es.getRanks().get(j) + 1);
      }
    }

    OLSMultipleLinearRegression regression = new OLSMultipleLinearRegression();
    regression.newSampleData(Y, X);

    mostSimilarCoefficients = new TDoubleArrayList(regression.estimateRegressionParameters());
    double pearson = Math.sqrt(regression.calculateRSquared());
    LOG.info("coefficients are " + mostSimilarCoefficients.toString());
    LOG.info("pearson for multiple regression is " + pearson);
  }
Exemplo n.º 2
0
  @Override
  public void trainSimilarity(List<EnsembleSim> simList) {
    if (simList.isEmpty()) {
      throw new IllegalArgumentException("no examples to train on!");
    }
    similarityInterpolator.trainSimilarity(simList);
    double[][] X = new double[simList.size()][numMetrics];
    double[] Y = new double[simList.size()];
    for (int i = 0; i < simList.size(); i++) {
      Y[i] = simList.get(i).knownSim.similarity;
      EnsembleSim es = similarityInterpolator.interpolate(simList.get(i));
      for (int j = 0; j < numMetrics; j++) {
        X[i][j] = es.getScores().get(j);
      }
    }
    OLSMultipleLinearRegression regression = new OLSMultipleLinearRegression();
    regression.newSampleData(Y, X);

    simlarityCoefficients = new TDoubleArrayList(regression.estimateRegressionParameters());
    double pearson = Math.sqrt(regression.calculateRSquared());
    LOG.info("coefficients are " + simlarityCoefficients.toString());
    LOG.info("pearson for multiple regression is " + pearson);
  }
Exemplo n.º 3
0
 @Override
 public SRResult predictSimilarity(List<SRResult> scores) {
   if (scores.size() + 1 != simlarityCoefficients.size()) {
     throw new IllegalStateException();
   }
   double weightedScore = simlarityCoefficients.get(0);
   for (int i = 0; i < scores.size(); i++) {
     double s = scores.get(i) == null ? Double.NaN : scores.get(i).getScore();
     if (Double.isNaN(s) || Double.isInfinite(s)) {
       s = similarityInterpolator.getInterpolatedScore(i);
     }
     weightedScore += (s * simlarityCoefficients.get(i + 1));
   }
   return new SRResult(weightedScore);
 }
Exemplo n.º 4
0
  @Override
  public SRResultList predictMostSimilar(
      List<SRResultList> scores, int maxResults, TIntSet validIds) {
    if (2 * scores.size() + 1 != mostSimilarCoefficients.size()) {
      throw new IllegalStateException();
    }
    TIntSet allIds = new TIntHashSet(); // ids returned by at least one metric
    for (SRResultList resultList : scores) {
      if (resultList != null) {
        for (SRResult result : resultList) {
          allIds.add(result.getId());
        }
      }
    }

    TIntDoubleHashMap scoreMap = new TIntDoubleHashMap();
    for (int id : allIds.toArray()) {
      scoreMap.put(id, mostSimilarCoefficients.get(0));
    }
    int i = 1;
    for (SRResultList resultList : scores) {
      TIntSet unknownIds = new TIntHashSet(allIds);
      double c1 = mostSimilarCoefficients.get(i); // score coeff
      double c2 = mostSimilarCoefficients.get(i + 1); // rank coefficient
      if (resultList != null) {
        for (int j = 0; j < resultList.numDocs(); j++) {
          int rank = j + 1;
          // expand or contract ranks proportionately
          if (validIds != null) {
            double k = 1.0 * numTrainingCandidateArticles / validIds.size();
            rank = (int) (rank * k);
          }
          SRResult result = resultList.get(j);
          unknownIds.remove(result.getId());
          double value = c1 * result.getScore() + c2 * Math.log(rank);
          if (debug) {
            System.err.format(
                "%s %d. %.3f (id=%d), computing %.3f * %.3f + %.3f * (log(%d) = %.3f)\n",
                "m" + i, j, value, result.getId(), c1, result.getScore(), c2, rank, Math.log(rank));
          }
          scoreMap.adjustValue(result.getId(), value);
        }
      }

      // interpolate scores for unknown ids
      double value =
          c1 * mostSimilarInterpolator.getInterpolatedScore(i / 2)
              + c2 * Math.log(mostSimilarInterpolator.getInterpolatedRank(i / 2));
      for (int id : unknownIds.toArray()) {
        scoreMap.adjustValue(id, value);
      }
      i += 2;
    }
    List<SRResult> resultList = new ArrayList<SRResult>();
    for (int id : scoreMap.keys()) {
      resultList.add(new SRResult(id, scoreMap.get(id)));
    }
    Collections.sort(resultList);
    Collections.reverse(resultList);
    int size = maxResults > resultList.size() ? resultList.size() : maxResults;
    SRResultList result = new SRResultList(size);
    for (i = 0; i < size; i++) {
      result.set(i, resultList.get(i));
    }
    return result;
  }