예제 #1
0
  /**
   * Classifies a tokenised document by mixing two naive-Bayes style scorers: one built from the
   * base dataset and one built from user-supplied data.
   *
   * <p>For every class {@code i} each scorer multiplies, over all tokens, the smoothed ratio
   * {@code (1 + A) / (classCount + numberOfClasses)}, where {@code A} is the value returned by
   * {@code TermDistributionDao.getA()} for that token and class (0 when the token has no entry).
   * The two per-class scores are then combined as {@code alpha * dataset + beta * userData} and
   * the class with the largest combined score wins (lowest index on ties).
   *
   * <p>Scores are accumulated as sums of logarithms instead of raw products. The raw product
   * overflows to {@code Infinity} or underflows to {@code 0.0} for long token lists, and
   * {@code 0.0 * Infinity} is {@code NaN}, which made every comparison fail and silently selected
   * class 0. The constant factor of 1000 per token used previously was identical for every class
   * and both scorers, so dropping it does not change the ranking.
   *
   * <p>NOTE(review): when a user class has at least 5 entries but fewer than the dataset class,
   * the user-data score is weighted by {@code c1 / (c1 + c2)} and the dataset score by
   * {@code c2 / (c1 + c2)}, i.e. the smaller corpus gets the larger weight. This is kept exactly
   * as it was; confirm whether the two weights were meant to be the other way round.
   *
   * @param tokens the tokens of the document to classify; may be {@code null} or empty
   * @return the index of the best scoring class, or {@code -1} when {@code tokens} is
   *     {@code null} or empty (in which case the database is not queried)
   */
  public int classifyDoc(ArrayList<String> tokens) {
    if (tokens == null || tokens.isEmpty()) {
      return -1;
    }

    Map<String, Map<Integer, TermDistributionDao>> termDistributions =
        databaseConnector.getAllTokensDistribution(tokens, false);
    Map<String, Map<Integer, TermDistributionDao>> userDataTermDistributions =
        databaseConnector.getAllTokensDistribution(tokens, true);

    int maxIndex = 0;
    double maximumValue = Double.NEGATIVE_INFINITY;
    for (int i = 0; i < numberOfClasses; i++) {
      double c1 = classContents.get(i);
      double c2 = userDataClassContents.get(i);

      // Mixing weights: the user-data scorer is ignored until its class has at least 5 entries.
      double alpha = 1.0;
      double beta = 0.0;
      if (c2 >= 5) {
        if (c2 >= c1) {
          alpha = 0.5;
          beta = 0.5;
        } else {
          beta = c1 / (c1 + c2);
          alpha = c2 / (c1 + c2);
        }
      }

      double datasetLogScore =
          sumTokenLogLikelihoods(
              tokens, termDistributions, i, classContents.get(i) + numberOfClasses);
      double userDataLogScore =
          sumTokenLogLikelihoods(
              tokens, userDataTermDistributions, i, userDataClassContents.get(i) + numberOfClasses);

      // log(alpha * dataset + beta * userData); the logarithm is monotonic, so the arg-max is
      // the same as for the plain weighted sum.
      double combinedLogScore =
          logOfWeightedSum(alpha, datasetLogScore, beta, userDataLogScore);
      if (combinedLogScore > maximumValue) {
        maximumValue = combinedLogScore;
        maxIndex = i;
      }
    }
    return maxIndex;
  }

  /**
   * Sums {@code log((1 + A) / denominator)} over all tokens for one class, where {@code A} is the
   * token's {@code getA()} value for {@code classId} in {@code distributions}, or 0 when absent.
   */
  private double sumTokenLogLikelihoods(
      ArrayList<String> tokens,
      Map<String, Map<Integer, TermDistributionDao>> distributions,
      int classId,
      double denominator) {
    double logSum = 0.0;
    for (String token : tokens) {
      int termDistA = 0;
      Map<Integer, TermDistributionDao> perClass = distributions.get(token);
      TermDistributionDao distribution = (perClass == null) ? null : perClass.get(classId);
      if (distribution != null) {
        termDistA = distribution.getA();
      }
      logSum += Math.log((1.0 + termDistA) / denominator);
    }
    return logSum;
  }

  /**
   * Returns {@code log(weightA * exp(logA) + weightB * exp(logB))} without leaving log space, so
   * very large or very small scores cannot overflow or underflow. A zero weight contributes
   * nothing.
   */
  private double logOfWeightedSum(double weightA, double logA, double weightB, double logB) {
    // Math.log(0.0) is -Infinity, which correctly removes a zero-weighted term below.
    double termA = Math.log(weightA) + logA;
    double termB = Math.log(weightB) + logB;
    double larger = Math.max(termA, termB);
    if (larger == Double.NEGATIVE_INFINITY) {
      return larger;
    }
    double smaller = Math.min(termA, termB);
    return larger + Math.log1p(Math.exp(smaller - larger));
  }
예제 #2
0
  /**
   * The initial classification method: a single naive-Bayes style scorer over the base dataset
   * and the user data pooled together.
   *
   * <p>For every class {@code i} the score is the product, over all tokens, of the smoothed ratio
   * {@code (1 + A) / (classCount + userClassCount + numberOfClasses)}, where {@code A} is the sum
   * of the {@code TermDistributionDao.getA()} values found for that token and class in the dataset
   * and in the user data (0 when neither has an entry). The class with the largest score wins
   * (lowest index on ties).
   *
   * <p>The score is accumulated as a sum of logarithms instead of a raw product. The raw product
   * overflows to {@code Infinity} or underflows to {@code 0.0} for long token lists, which made
   * all classes tie and silently selected class 0. The constant factor of 1000 per token used
   * previously was identical for every class, so dropping it does not change the ranking.
   *
   * @param tokens the tokens of the document to classify; may be {@code null} or empty
   * @return the index of the best scoring class, or {@code -1} when {@code tokens} is
   *     {@code null} or empty (in which case the database is not queried)
   */
  public int classifyDocClassic(ArrayList<String> tokens) {
    if (tokens == null || tokens.isEmpty()) {
      return -1;
    }

    Map<String, Map<Integer, TermDistributionDao>> termDistributions =
        databaseConnector.getAllTokensDistribution(tokens, false);
    Map<String, Map<Integer, TermDistributionDao>> userDataTermDistributions =
        databaseConnector.getAllTokensDistribution(tokens, true);

    int maxIndex = 0;
    double maximumValue = Double.NEGATIVE_INFINITY;
    for (int i = 0; i < numberOfClasses; i++) {
      // Smoothing denominator shared by every token of this class.
      double denominator =
          (classContents.get(i) + userDataClassContents.get(i)) + numberOfClasses;

      double logScore = 0.0;
      for (String token : tokens) {
        int termDistA = 0;

        Map<Integer, TermDistributionDao> datasetPerClass = termDistributions.get(token);
        TermDistributionDao datasetEntry =
            (datasetPerClass == null) ? null : datasetPerClass.get(i);
        if (datasetEntry != null) {
          termDistA = datasetEntry.getA();
        }

        Map<Integer, TermDistributionDao> userPerClass = userDataTermDistributions.get(token);
        TermDistributionDao userEntry = (userPerClass == null) ? null : userPerClass.get(i);
        if (userEntry != null) {
          termDistA += userEntry.getA();
        }

        logScore += Math.log((1.0 + termDistA) / denominator);
      }

      // Strict comparison keeps the lowest class index on ties.
      if (logScore > maximumValue) {
        maximumValue = logScore;
        maxIndex = i;
      }
    }
    return maxIndex;
  }