/**
 * Classifies a piece of text and records the outcome in the database.
 *
 * <p>The text is tokenized and passed to a {@link Classifier}. If the classifier assigns a class
 * (any id other than -1), the class name is looked up, and the class contents and term
 * distribution are updated for that class. If the classifier returns -1, nothing is written to
 * the database, because there is no class to attribute the text to.
 *
 * <p>The database connection opened here is always closed, even when a step throws.
 *
 * @param inputText the text to classify
 * @return the class name the database reports for the assigned class id, or {@code "N/A"} when
 *     the classifier returns -1
 */
public String classifyText(String inputText) {
    TextParser textParser = new TextParser();
    ArrayList<String> tokens = textParser.tokenizeString(inputText, true);

    DatabaseConnector databaseConnector = new DatabaseConnector();
    try {
      Classifier classifier = new Classifier(databaseConnector);
      int classId = classifier.classifyDoc(tokens);

      String assignedClass = "N/A";
      if (classId != -1) {
        assignedClass = databaseConnector.getClassName(classId);
      }
      Log.i("Classifier : ", "I've finished classifying.");
      Log.i("Class is : ", assignedClass);

      // -1 means "no class assigned"; updating counts for it would record statistics
      // against a class id that does not exist.
      if (classId != -1) {
        // The trailing 'true' is presumably the user-data flag -- confirm in DatabaseConnector.
        databaseConnector.updateClassContents(classId, true);
        Log.i("Classifier : ", "I've finished updating class counts.");

        databaseConnector.updateTermDistribution(
            textParser.getAllTokens(inputText, true), classId, true);
        Log.i("Classifier : ", "I've finished updating term distribution.");
      }

      return assignedClass;
    } finally {
      // Release the connection on every path, including exceptions.
      databaseConnector.closeDBConnection();
    }
  }
Ejemplo n.º 2
0
 /**
  * Calculates the Gini coefficient of a term from its per-class chi-square values. It is used to
  * measure the global goodness of a term.
  *
  * <p>The occurrence count of the term is the sum of {@code getA()} over every entry of both the
  * dataset and the user-data term distributions. A term whose total count is below
  * {@code SUPPORT_FACTOR} gets a coefficient of 0. Otherwise one chi-square value is computed per
  * class id in {@code [0, numberOfClasses)}, the values are sorted ascending, and the coefficient
  * is {@code sum((2 * rank - n - 1) * value) / (n * n * mean)} with 1-based ranks.
  *
  * <p>When the coefficient is undefined (no classes, or a chi-square mean of zero or NaN) this
  * method returns 0 instead of NaN, so callers comparing against a threshold or sorting by value
  * always receive an ordinary number.
  *
  * @param term the term to evaluate
  * @return the Gini coefficient of the term, or 0 when the support constraint is not met or the
  *     coefficient is undefined
  */
 public double calculateGiniCoefficient(String term) {
   Map<Integer, TermDistributionDao> termDistributionDaos =
       databaseConnector.getAllTermDistribution(term, false);
   Map<Integer, TermDistributionDao> userDataTermDistributionDaos =
       databaseConnector.getAllTermDistribution(term, true);

   // Total occurrences across both sources; every map entry contributes exactly once.
   int totalNumberOfOccurences = 0;
   for (TermDistributionDao dao : termDistributionDaos.values()) {
     if (dao != null) {
       totalNumberOfOccurences += dao.getA();
     }
   }
   for (TermDistributionDao dao : userDataTermDistributionDaos.values()) {
     if (dao != null) {
       totalNumberOfOccurences += dao.getA();
     }
   }

   // If support factor constraint is not satisfied, the term is discarded.
   if (totalNumberOfOccurences < SUPPORT_FACTOR) {
     return 0.0;
   }
   // Without classes there is nothing to measure inequality over.
   if (numberOfClasses <= 0) {
     return 0.0;
   }

   List<Double> chiSquareValues = new ArrayList<Double>();
   double chiSquareMean = 0;
   for (int classId = 0; classId < numberOfClasses; classId++) {
     int occurrencesInClass = 0;
     TermDistributionDao datasetDao = termDistributionDaos.get(classId);
     if (datasetDao != null) {
       occurrencesInClass += datasetDao.getA();
     }
     TermDistributionDao userDataDao = userDataTermDistributionDaos.get(classId);
     if (userDataDao != null) {
       occurrencesInClass += userDataDao.getA();
     }
     double chiSquare =
         calculateChiSquare(
             occurrencesInClass, totalNumberOfOccurences - occurrencesInClass, classId);
     chiSquareValues.add(chiSquare);
     chiSquareMean += chiSquare;
   }
   chiSquareMean /= numberOfClasses;

   // A zero (or NaN) mean would make the normalisation below produce NaN.
   if (chiSquareMean == 0.0 || Double.isNaN(chiSquareMean)) {
     return 0.0;
   }

   Collections.sort(chiSquareValues);
   double giniCoefficient = 0;
   for (int i = 0; i < numberOfClasses; i++) {
     giniCoefficient += chiSquareValues.get(i) * (2 * (i + 1) - numberOfClasses - 1);
   }
   giniCoefficient /= (numberOfClasses * numberOfClasses * chiSquareMean);
   return giniCoefficient;
 }
 /**
  * Classifies every activity the database reports as unclassified, writes the assigned classes
  * back, and then stops {@code ClassifierService}.
  *
  * <p>Each activity's trimmed activity info is passed to {@code classifyUrl}. The database
  * connection is closed even if classification or the update throws.
  */
 @Override
 public void run() {
   DatabaseConnector databaseConnector = new DatabaseConnector();
   try {
     ArrayList<ActivityDao> activityDaos = databaseConnector.getUnclassifiedActivities();
     for (ActivityDao activityDao : activityDaos) {
       String url = activityDao.getActivityInfo().trim();
       activityDao.setAssignedClass(classifyUrl(url));
     }
     databaseConnector.updateActivities(activityDaos);
   } finally {
     // Release the connection on every path, including exceptions.
     databaseConnector.closeDBConnection();
   }
   stopService(new Intent(getBaseContext(), ClassifierService.class));
 }
Ejemplo n.º 4
0
 /**
  * Selects the feature terms from all terms known to the database.
  *
  * <p>The dataset terms list is extended with every user-data term it does not already contain.
  * The Gini coefficient of each term is then calculated, and terms whose coefficient is at least
  * {@code GINI_THRESHOLD} are kept. The support-factor requirement is enforced inside
  * {@link #calculateGiniCoefficient(String)}.
  *
  * @return the terms accepted as features, in the order they were examined
  */
 public ArrayList<String> calculateFeaturesList() {
   ArrayList<String> termsList = databaseConnector.getTermsList(false);
   ArrayList<String> userDataTermsList = databaseConnector.getTermsList(true);

   // Track membership in a hash set so merging is linear rather than one list scan per term.
   Set<String> knownTerms = new HashSet<String>(termsList);
   for (String term : userDataTermsList) {
     // add() returns true only for terms not seen before.
     if (knownTerms.add(term)) {
       termsList.add(term);
     }
   }

   ArrayList<String> featuresList = new ArrayList<String>();
   for (String term : termsList) {
     double giniCoefficient = calculateGiniCoefficient(term);
     if (giniCoefficient >= GINI_THRESHOLD) {
       featuresList.add(term);
     }
   }
   return featuresList;
 }
Ejemplo n.º 5
0
 /**
  * Calculates the Gini coefficient of every known term and returns the term-to-coefficient
  * mapping. Intended for testing and analysis.
  *
  * <p>The dataset terms list is extended with every user-data term it does not already contain.
  * The coefficients are collected in a plain map, then copied into a {@link TreeMap} ordered by a
  * {@code ValueComparator} built over that plain map. According to the original author's note
  * this yields the values in descending order; the comparator is not visible here, so confirm
  * its ordering and tie handling before relying on lookups by key.
  *
  * @return the mapping from term to Gini coefficient, ordered by the {@code ValueComparator}
  */
 public Map<String, Double> getGiniMapping() {
   ArrayList<String> termsList = databaseConnector.getTermsList(false);
   ArrayList<String> userDataTermsList = databaseConnector.getTermsList(true);

   // Track membership in a hash set so merging is linear rather than one list scan per term.
   Set<String> knownTerms = new HashSet<String>(termsList);
   for (String term : userDataTermsList) {
     // add() returns true only for terms not seen before.
     if (knownTerms.add(term)) {
       termsList.add(term);
     }
   }

   Map<String, Double> termGiniMapping = new HashMap<String, Double>();
   ValueComparator valueComparator = new ValueComparator(termGiniMapping);
   TreeMap<String, Double> sortedTermGiniMapping = new TreeMap<String, Double>(valueComparator);
   for (String term : termsList) {
     double giniCoefficient = calculateGiniCoefficient(term);
     termGiniMapping.put(term, giniCoefficient);
   }
   // The backing map must be fully populated before the sorted map is filled, because the
   // comparator was constructed over it.
   sortedTermGiniMapping.putAll(termGiniMapping);
   return sortedTermGiniMapping;
 }
 /**
  * Recomputes the classifier's features, dumps the resulting features list to
  * {@code MobileProfilerDatabase/features.txt} under the external storage directory (one feature
  * per line), and then stops {@code FeatureComputationService}.
  *
  * <p>An {@link IOException} while writing is logged and does not prevent the service from being
  * stopped. The file writer and the database connection are released on every path.
  */
 @Override
 public void run() {
   DatabaseConnector databaseConnector = new DatabaseConnector();
   try {
     Classifier classifier = new Classifier(databaseConnector);
     classifier.recomputeFeatures();
     ArrayList<String> features = databaseConnector.getAllFeaturesList();
     FileWriter fileWriter = null;
     try {
       fileWriter =
           new FileWriter(
               Environment.getExternalStorageDirectory().getPath()
                   + "/MobileProfilerDatabase/features.txt");
       for (String feature : features) {
         fileWriter.write(feature + "\n");
       }
     } catch (IOException e) {
       Log.e("Error error : ", "FATAL ERROR", e);
     } finally {
       // Close the writer even when a write fails, so the file handle is not leaked.
       if (fileWriter != null) {
         try {
           fileWriter.close();
         } catch (IOException e) {
           Log.e("Error error : ", "FATAL ERROR", e);
         }
       }
     }
   } finally {
     // Release the connection on every path, including exceptions.
     databaseConnector.closeDBConnection();
   }
   stopService(new Intent(getBaseContext(), FeatureComputationService.class));
 }
Ejemplo n.º 7
0
  /**
   * The key part of this class. Runs the classifier on a set of tokens and returns the id of the
   * class they are assigned to.
   *
   * <p>Two per-class scores are computed, one from the dataset term distributions and one from
   * the user-data term distributions. Each score is the product, over all tokens, of {@code 1000
   * * (1 + A) / (classSize + numberOfClasses)}, where {@code A} is the token's {@code getA()}
   * count for the class (0 if absent). The two scores are blended as {@code alpha * dataset +
   * beta * userData}, with the weights chosen from the class sizes, and the class with the
   * highest blended score wins (the lowest index wins ties).
   *
   * <p>TODO Fix a logic to be followed and describe it over here.
   *
   * @param tokens the tokens of the document to classify
   * @return the winning class id, or -1 when there are no tokens to classify
   */
  public int classifyDoc(ArrayList<String> tokens) {
    // Nothing to match against; skip the database queries entirely.
    if (tokens == null || tokens.isEmpty()) {
      return -1;
    }

    Map<String, Map<Integer, TermDistributionDao>> termDistributions =
        databaseConnector.getAllTokensDistribution(tokens, false);
    Map<String, Map<Integer, TermDistributionDao>> userDataTermDistributions =
        databaseConnector.getAllTokensDistribution(tokens, true);

    double[] datasetScores = computeClassScores(tokens, termDistributions, false);
    double[] userDataScores = computeClassScores(tokens, userDataTermDistributions, true);

    int maxIndex = 0;
    double maximumValue = -1.0;
    for (int i = 0; i < numberOfClasses; i++) {
      // alpha weights the dataset score, beta the user-data score. User data is ignored
      // until the class holds at least 5 user documents.
      double alpha = 1.0;
      double beta = 0.0;
      double c1 = classContents.get(i);
      double c2 = userDataClassContents.get(i);
      if (c2 >= 5) {
        if (c2 >= c1) {
          alpha = 0.5;
          beta = 0.5;
        } else {
          // NOTE(review): here the user-data weight is proportional to the *dataset* size, so
          // user data gets the larger weight when it is the smaller source. Kept as written;
          // confirm this is intended and not a swapped alpha/beta.
          beta = c1 / (c1 + c2);
          alpha = c2 / (c1 + c2);
        }
      }
      double combinedProbability = (alpha * datasetScores[i]) + (beta * userDataScores[i]);
      if (combinedProbability > maximumValue) {
        maximumValue = combinedProbability;
        maxIndex = i;
      }
    }
    return maxIndex;
  }

  /**
   * Computes, for every class, the product over all tokens of the scaled, add-one-smoothed term
   * frequency {@code 1000 * (1 + A) / (classSize + numberOfClasses)}.
   *
   * @param tokens the tokens of the document
   * @param distributions per-token, per-class term distributions to read {@code A} from
   * @param fromUserData whether class sizes come from {@code userDataClassContents} (true) or
   *     {@code classContents} (false)
   * @return one non-negative score per class id
   */
  private double[] computeClassScores(
      ArrayList<String> tokens,
      Map<String, Map<Integer, TermDistributionDao>> distributions,
      boolean fromUserData) {
    double[] products = new double[numberOfClasses];
    // A negative value marks "no token multiplied in yet".
    for (int i = 0; i < numberOfClasses; i++) {
      products[i] = -1.0;
    }
    for (String token : tokens) {
      Map<Integer, TermDistributionDao> perClass = distributions.get(token);
      for (int i = 0; i < numberOfClasses; i++) {
        int termDistA = 0;
        if (perClass != null && perClass.containsKey(i)) {
          termDistA = perClass.get(i).getA();
        }
        double classSize = fromUserData ? userDataClassContents.get(i) : classContents.get(i);
        double likelihood = 1000 * ((1.0 * (1 + termDistA)) / (classSize + numberOfClasses));
        products[i] = (products[i] < 0) ? likelihood : likelihood * products[i];
      }
    }
    // Clamp any remaining negative marker to zero.
    for (int i = 0; i < numberOfClasses; i++) {
      if (products[i] < 0) {
        products[i] = 0.0;
      }
    }
    return products;
  }
Ejemplo n.º 8
0
 /** Closes the database connection by delegating to the underlying database connector. */
 public void closeDBConnection() {
   this.databaseConnector.closeDBConnection();
 }
Ejemplo n.º 9
0
 /**
  * Rebuilds the 'features' table: the existing features are deleted first, then the features list
  * is calculated afresh and inserted.
  */
 public void recomputeFeatures() {
   databaseConnector.deleteFeatures();
   ArrayList<String> freshFeatures = calculateFeaturesList();
   databaseConnector.insertFeatures(freshFeatures);
 }
Ejemplo n.º 10
0
  /**
   * This is the initial classification method we used.
   *
   * <p>For every class, the score is the product over all tokens of {@code 1000 * (1 + A) /
   * (classSize + numberOfClasses)}, where {@code A} is the sum of the token's dataset and
   * user-data {@code getA()} counts for the class (0 if absent) and {@code classSize} is the sum
   * of the dataset and user-data class contents. The class with the highest score wins (the
   * lowest index wins ties).
   *
   * @param tokens the tokens of the document to classify
   * @return the winning class id, or -1 when there are no tokens to classify
   */
  public int classifyDocClassic(ArrayList<String> tokens) {
    // Nothing to match against; skip the database queries entirely.
    if (tokens == null || tokens.isEmpty()) {
      return -1;
    }

    Map<String, Map<Integer, TermDistributionDao>> termDistributions =
        databaseConnector.getAllTokensDistribution(tokens, false);
    Map<String, Map<Integer, TermDistributionDao>> userDataTermDistributions =
        databaseConnector.getAllTokensDistribution(tokens, true);

    double[] products = new double[numberOfClasses];
    // A negative value marks "no token multiplied in yet".
    for (int i = 0; i < numberOfClasses; i++) {
      products[i] = -1.0;
    }
    for (String token : tokens) {
      Map<Integer, TermDistributionDao> datasetPerClass = termDistributions.get(token);
      Map<Integer, TermDistributionDao> userDataPerClass = userDataTermDistributions.get(token);
      for (int i = 0; i < numberOfClasses; i++) {
        int termDistA = 0;
        if (datasetPerClass != null && datasetPerClass.containsKey(i)) {
          termDistA = datasetPerClass.get(i).getA();
        }
        if (userDataPerClass != null && userDataPerClass.containsKey(i)) {
          termDistA += userDataPerClass.get(i).getA();
        }
        double likelihood =
            1000
                * ((1.0 * (1 + termDistA))
                    / ((classContents.get(i) + userDataClassContents.get(i)) + numberOfClasses));
        products[i] = (products[i] < 0) ? likelihood : likelihood * products[i];
      }
    }

    int maxIndex = 0;
    double maximumValue = -1.0;
    for (int i = 0; i < numberOfClasses; i++) {
      // Clamp any remaining negative marker to zero before comparing.
      double score = (products[i] < 0) ? 0.0 : products[i];
      if (score > maximumValue) {
        maximumValue = score;
        maxIndex = i;
      }
    }
    return maxIndex;
  }