/**
 * Tokenizes the given text, classifies it, and (on success) feeds the result back into the
 * user-data model before closing the database connection.
 *
 * @param inputText raw text to classify
 * @return the human-readable class name, or "N/A" when classification fails
 */
public String classifyText(String inputText) {
  TextParser textParser = new TextParser();
  ArrayList<String> tokens = textParser.tokenizeString(inputText, true);
  DatabaseConnector databaseConnector = new DatabaseConnector();
  try {
    Classifier classifier = new Classifier(databaseConnector);
    int classId = classifier.classifyDoc(tokens);
    String assignedClass = "N/A";
    Log.i("Classifier : ", "I've finished classifying.");
    if (classId != -1) {
      assignedClass = databaseConnector.getClassName(classId);
      Log.i("Class is : ", assignedClass);
      // Only update the model when classification actually succeeded. The previous code
      // called these updates unconditionally, writing counts for the sentinel classId -1.
      databaseConnector.updateClassContents(classId, true);
      Log.i("Classifier : ", "I've finished updating class counts.");
      databaseConnector.updateTermDistribution(
          textParser.getAllTokens(inputText, true), classId, true);
      Log.i("Classifier : ", "I've finished updating term distribution.");
    } else {
      Log.i("Class is : ", assignedClass);
    }
    return assignedClass;
  } finally {
    // Always release the connection, even if classification or an update throws.
    databaseConnector.closeDBConnection();
  }
}
/**
 * This function calculates the gini coefficient of a given term. It is used to measure the global
 * goodness of a term. Gini coefficient of a term that doesn't satisfy the support factor
 * constraint is taken to be 0.
 *
 * @param term the term whose global goodness is being measured
 * @return the gini coefficient, or 0.0 when the support-factor constraint is not met or all
 *     chi-square values are zero
 */
public double calculateGiniCoefficient(String term) {
  // Per-class term distributions from the shipped dataset and from user-collected data.
  Map<Integer, TermDistributionDao> termDistributionDaos =
      databaseConnector.getAllTermDistribution(term, false);
  Map<Integer, TermDistributionDao> userDataTermDistributionDaos =
      databaseConnector.getAllTermDistribution(term, true);
  double giniCoefficient = 0.0;
  List<Double> chiSquareValues = new ArrayList<Double>();
  double chiSquareMean = 0;
  int totalNumberOfOccurences = 0;

  // Union of class ids seen in either distribution.
  Set<Integer> classIdSet = new HashSet<Integer>();
  classIdSet.addAll(termDistributionDaos.keySet());
  classIdSet.addAll(userDataTermDistributionDaos.keySet());

  // Total occurrences of the term across both sources (the "A" count per class).
  for (int classId : classIdSet) {
    TermDistributionDao termDistributionDao = termDistributionDaos.get(classId);
    TermDistributionDao userDataTermDistributionDao = userDataTermDistributionDaos.get(classId);
    if (termDistributionDao != null) {
      totalNumberOfOccurences += termDistributionDao.getA();
    }
    if (userDataTermDistributionDao != null) {
      totalNumberOfOccurences += userDataTermDistributionDao.getA();
    }
  }

  // If support factor constraint is not satisfied, I'm getting rid of the term.
  if (totalNumberOfOccurences < SUPPORT_FACTOR) {
    return 0.0;
  }

  // Chi-square per class, combining dataset and user-data counts.
  for (int classId = 0; classId < numberOfClasses; classId++) {
    int A = 0;
    if (termDistributionDaos.containsKey(classId)) {
      A += termDistributionDaos.get(classId).getA();
    }
    if (userDataTermDistributionDaos.containsKey(classId)) {
      A += userDataTermDistributionDaos.get(classId).getA();
    }
    double chiSquare = calculateChiSquare(A, totalNumberOfOccurences - A, classId);
    chiSquareValues.add(chiSquare);
    chiSquareMean += chiSquare;
  }
  chiSquareMean /= numberOfClasses;

  // Guard against division by zero below: if every chi-square value is 0 the mean is 0
  // and the original code produced NaN/Infinity. A uniformly zero term has no
  // discriminative power, so 0 is the correct coefficient.
  if (chiSquareMean == 0) {
    return 0.0;
  }

  // Standard gini computation over the sorted chi-square values.
  Collections.sort(chiSquareValues);
  giniCoefficient = 0;
  for (int i = 0; i < numberOfClasses; i++) {
    giniCoefficient += chiSquareValues.get(i) * (2 * (i + 1) - numberOfClasses - 1);
  }
  giniCoefficient /= (numberOfClasses * numberOfClasses * chiSquareMean);
  return giniCoefficient;
}
@Override
public void run() {
  // Classify every pending activity, persist the results, then shut the service down.
  DatabaseConnector connector = new DatabaseConnector();
  ArrayList<ActivityDao> pendingActivities = connector.getUnclassifiedActivities();
  for (int i = 0; i < pendingActivities.size(); i++) {
    ActivityDao dao = pendingActivities.get(i);
    String trimmedUrl = dao.getActivityInfo().trim();
    dao.setAssignedClass(classifyUrl(trimmedUrl));
  }
  connector.updateActivities(pendingActivities);
  connector.closeDBConnection();
  stopService(new Intent(getBaseContext(), ClassifierService.class));
}
/**
 * This method takes in all the unique terms present in our DB, calculates the Gini Coefficient of
 * each term. If the term satisfies both the Gini Coefficient and Support factor requirements,
 * it's considered to be a feature.
 *
 * @return the list of terms that qualify as features, dataset terms first, then novel user terms
 */
public ArrayList<String> calculateFeaturesList() {
  ArrayList<String> termsList = databaseConnector.getTermsList(false);
  ArrayList<String> userDataTermsList = databaseConnector.getTermsList(true);
  // O(1) membership test instead of the original O(n) ArrayList.contains inside the loop,
  // which made the merge quadratic. add() returns false for duplicates, so list order and
  // contents are identical to before.
  Set<String> seenTerms = new HashSet<String>(termsList);
  for (String term : userDataTermsList) {
    if (seenTerms.add(term)) {
      termsList.add(term);
    }
  }
  ArrayList<String> featuresList = new ArrayList<String>();
  for (String term : termsList) {
    double giniCoefficient = calculateGiniCoefficient(term);
    // Support-factor filtering happens inside calculateGiniCoefficient (returns 0.0).
    if (giniCoefficient >= GINI_THRESHOLD) {
      featuresList.add(term);
    }
  }
  return featuresList;
}
/**
 * Simple method that calculates the gini values for each term and returns that mapping. Uses the
 * ValueComparator class to get the values in a descending order. Perfect for testing and
 * analysis.
 *
 * @return a term-to-gini map whose iteration order is determined by ValueComparator
 */
public Map<String, Double> getGiniMapping() {
  // Merge dataset terms and user-data terms, keeping each term once.
  ArrayList<String> termsList = databaseConnector.getTermsList(false);
  ArrayList<String> userDataTermsList = databaseConnector.getTermsList(true);
  for (String term : userDataTermsList) {
    if (!termsList.contains(term)) {
      termsList.add(term);
    }
  }
  Map<String, Double> termGiniMapping = new HashMap<String, Double>();
  // NOTE(review): a TreeMap ordered by a comparator over the backing map's VALUES is a known
  // pitfall — if ValueComparator returns 0 for two distinct terms with equal gini values,
  // TreeMap treats those keys as duplicates and silently drops entries. ValueComparator is
  // not visible here; verify it tie-breaks equal values before trusting this map's size.
  ValueComparator valueComparator = new ValueComparator(termGiniMapping);
  TreeMap<String, Double> sortedTermGiniMapping = new TreeMap<String, Double>(valueComparator);
  // Populate the plain map first so the comparator can see every value during putAll.
  for (String term : termsList) {
    double giniCoefficient = calculateGiniCoefficient(term);
    termGiniMapping.put(term, giniCoefficient);
  }
  sortedTermGiniMapping.putAll(termGiniMapping);
  return sortedTermGiniMapping;
}
@Override
public void run() {
  // Recompute the feature set, dump it to external storage for inspection, then stop
  // the service.
  DatabaseConnector databaseConnector = new DatabaseConnector();
  Classifier classifier = new Classifier(databaseConnector);
  classifier.recomputeFeatures();
  ArrayList<String> features = databaseConnector.getAllFeaturesList();
  FileWriter fileWriter = null;
  try {
    fileWriter = new FileWriter(
        Environment.getExternalStorageDirectory().getPath()
            + "/MobileProfilerDatabase/features.txt");
    for (String feature : features) {
      fileWriter.write(feature + "\n");
    }
  } catch (IOException e) {
    Log.e("Error error : ", "FATAL ERROR");
    e.printStackTrace();
  } finally {
    // Close in finally: the original closed inside the try, leaking the writer whenever
    // a write threw.
    if (fileWriter != null) {
      try {
        fileWriter.close();
      } catch (IOException e) {
        e.printStackTrace();
      }
    }
  }
  databaseConnector.closeDBConnection();
  stopService(new Intent(getBaseContext(), FeatureComputationService.class));
}
/**
 * The key part of this class. This function takes in a set of tokens, runs the classifier on them
 * and returns the classId to which they're classified. //TODO Fix a logic to be followed and
 * describe it over here.
 *
 * <p>Runs two independent Naive-Bayes-style passes — one over the shipped dataset counts, one
 * over user-collected counts — then blends the two per-class scores and returns the argmax.
 *
 * @param tokens the tokenized document to classify
 * @return the winning class id, or -1 when no tokens were processed
 */
public int classifyDoc(ArrayList<String> tokens) {
  // NOTE(review): this counter is incremented once per token in BOTH passes (so it ends up at
  // 2 * tokens.size()), regardless of whether the token matched anything — the name suggests
  // it should only count matched features. It is only used for the == 0 early-out below, so
  // in practice it just detects an empty token list. TODO confirm intended semantics.
  int numOfMatchedFeatures = 0;
  ArrayList<Double> datasetClassifierProbabilities = new ArrayList<Double>();
  ArrayList<Double> userDataClassifierProbabilities = new ArrayList<Double>();
  Double temp = 0.0;
  // Priors: raw class document counts (normalization by total docs is commented out;
  // it cancels in the final argmax).
  for (int i = 0; i < numberOfClasses; i++) {
    temp = (1.0 * classContents.get(i)) /* / totalNumberOfDocs */;
    datasetClassifierProbabilities.add(temp);
    temp = (1.0 * userDataClassContents.get(i)) /* / totalNumberOfDocs */;
    userDataClassifierProbabilities.add(temp);
  }
  // token -> (classId -> distribution row), fetched once per source to avoid per-token queries.
  Map<String, Map<Integer, TermDistributionDao>> termDistributions =
      databaseConnector.getAllTokensDistribution(tokens, false);
  Map<String, Map<Integer, TermDistributionDao>> userDataTermDistributions =
      databaseConnector.getAllTokensDistribution(tokens, true);
  // -1.0 is a sentinel meaning "no token contributed to this class yet".
  ArrayList<Double> productProbabilities = new ArrayList<Double>();
  for (int i = 0; i < numberOfClasses; i++) {
    productProbabilities.add(-1.0);
  }
  // Pass 1: dataset counts. Laplace smoothing (1 + A) with the 1000x factor presumably there
  // to delay floating-point underflow of the running product — TODO confirm.
  for (String token : tokens) {
    numOfMatchedFeatures++;
    for (int i = 0; i < numberOfClasses; i++) {
      int termDistA = 0;
      if ((termDistributions.containsKey(token))
          && (termDistributions.get(token).containsKey(i))) {
        termDistA = termDistributions.get(token).get(i).getA();
      }
      temp = 1000 * ((1.0 * (1 + termDistA)) / (classContents.get(i) + numberOfClasses));
      if (productProbabilities.get(i) < 0) {
        productProbabilities.set(i, temp);
      } else {
        productProbabilities.set(i, temp * productProbabilities.get(i));
      }
    }
  }
  // Note: the prior stored above is OVERWRITTEN here, not multiplied in — the final score is
  // the token-likelihood product alone (0.0 when no token contributed).
  for (int i = 0; i < numberOfClasses; i++) {
    temp = productProbabilities.get(i);
    if (temp < 0) {
      temp = 0.0;
    }
    datasetClassifierProbabilities.set(i, temp);
  }
  // Pass 2: identical computation against the user-data counts.
  productProbabilities.clear();
  for (int i = 0; i < numberOfClasses; i++) {
    productProbabilities.add(-1.0);
  }
  for (String token : tokens) {
    numOfMatchedFeatures++;
    for (int i = 0; i < numberOfClasses; i++) {
      int termDistA = 0;
      if ((userDataTermDistributions.containsKey(token))
          && (userDataTermDistributions.get(token).containsKey(i))) {
        termDistA = userDataTermDistributions.get(token).get(i).getA();
      }
      temp = 1000 * ((1.0 * (1 + termDistA)) / (userDataClassContents.get(i) + numberOfClasses));
      if (productProbabilities.get(i) < 0) {
        productProbabilities.set(i, temp);
      } else {
        productProbabilities.set(i, temp * productProbabilities.get(i));
      }
    }
  }
  for (int i = 0; i < numberOfClasses; i++) {
    temp = productProbabilities.get(i);
    if (temp < 0) {
      temp = 0.0;
    }
    userDataClassifierProbabilities.set(i, temp);
  }
  // Nothing processed at all (empty token list) -> no classification.
  if (numOfMatchedFeatures == 0) {
    return -1;
  }
  // Blend the two scores per class. The user-data model only gets weight once a class has at
  // least 5 user documents; its weight grows with the user count (capped at 50/50).
  ArrayList<Double> combinedProbabilities = new ArrayList<Double>();
  for (int i = 0; i < numberOfClasses; i++) {
    double alpha = 1.0, beta = 0.0;
    double c1 = classContents.get(i);
    double c2 = userDataClassContents.get(i);
    if (c2 >= 5) {
      if (c2 >= c1) {
        alpha = 0.5;
        beta = 0.5;
      } else {
        beta = c1 / (c1 + c2);
        alpha = c2 / (c1 + c2);
      }
    }
    Double combinedProbability =
        (alpha * datasetClassifierProbabilities.get(i))
            + (beta * userDataClassifierProbabilities.get(i));
    combinedProbabilities.add(combinedProbability);
  }
  // Argmax over the blended scores; ties go to the lowest class id.
  int maxIndex = 0;
  Double maximumValue = -1.0;
  for (int i = 0; i < numberOfClasses; i++) {
    if (combinedProbabilities.get(i) > maximumValue) {
      maximumValue = combinedProbabilities.get(i);
      maxIndex = i;
    }
  }
  return maxIndex;
}
/** Closes the underlying database connection by delegating to the connector. */
public void closeDBConnection() {
  databaseConnector.closeDBConnection();
}
/**
 * Flushes the 'features' table and adds the newly computed features.
 *
 * <p>Delete-then-insert is not atomic here: a crash between the two calls leaves the table
 * empty until the next recomputation.
 */
public void recomputeFeatures() {
  databaseConnector.deleteFeatures();
  databaseConnector.insertFeatures(calculateFeaturesList());
}
/**
 * This is the initial classification method we used.
 *
 * <p>Single-pass variant: dataset and user-data term counts are summed into one pooled count
 * before the Naive-Bayes-style product, rather than blended after two separate passes.
 *
 * @param tokens the tokenized document to classify
 * @return the winning class id, or -1 when no tokens were processed
 */
public int classifyDocClassic(ArrayList<String> tokens) {
  // NOTE(review): incremented once per token regardless of any match; only used for the
  // == 0 early-out, so it effectively just detects an empty token list.
  int numOfMatchedFeatures = 0;
  ArrayList<Double> probabilities = new ArrayList<Double>();
  Double temp = 0.0;
  // Priors: combined class document counts (normalization commented out; cancels in argmax).
  for (int i = 0; i < numberOfClasses; i++) {
    temp = (1.0 * classContents.get(i)) /* / totalNumberOfDocs */;
    temp += (1.0 * userDataClassContents.get(i)) /* / totalNumberOfDocs */;
    probabilities.add(temp);
  }
  // token -> (classId -> distribution row) for each source, fetched in bulk.
  Map<String, Map<Integer, TermDistributionDao>> termDistributions =
      databaseConnector.getAllTokensDistribution(tokens, false);
  Map<String, Map<Integer, TermDistributionDao>> userDataTermDistributions =
      databaseConnector.getAllTokensDistribution(tokens, true);
  // -1.0 is a sentinel meaning "no token contributed to this class yet".
  ArrayList<Double> productProbabilities = new ArrayList<Double>();
  for (int i = 0; i < numberOfClasses; i++) {
    productProbabilities.add(-1.0);
  }
  // Laplace-smoothed likelihood product over pooled counts; the 1000x factor presumably
  // delays floating-point underflow of the running product — TODO confirm.
  for (String token : tokens) {
    numOfMatchedFeatures++;
    for (int i = 0; i < numberOfClasses; i++) {
      int termDistA = 0;
      if ((termDistributions.containsKey(token))
          && (termDistributions.get(token).containsKey(i))) {
        termDistA = termDistributions.get(token).get(i).getA();
      }
      if ((userDataTermDistributions.containsKey(token))
          && (userDataTermDistributions.get(token).containsKey(i))) {
        termDistA += userDataTermDistributions.get(token).get(i).getA();
      }
      temp = 1000 * ((1.0 * (1 + termDistA))
          / ((classContents.get(i) + userDataClassContents.get(i)) + numberOfClasses));
      if (productProbabilities.get(i) < 0) {
        productProbabilities.set(i, temp);
      } else {
        productProbabilities.set(i, temp * productProbabilities.get(i));
      }
    }
  }
  // Note: the prior stored above is OVERWRITTEN here, not multiplied in — the final score is
  // the token-likelihood product alone (0.0 when no token contributed).
  for (int i = 0; i < numberOfClasses; i++) {
    temp = productProbabilities.get(i);
    if (temp < 0) {
      temp = 0.0;
    }
    probabilities.set(i, temp);
  }
  if (numOfMatchedFeatures == 0) {
    return -1;
  }
  // Argmax; ties go to the lowest class id.
  int maxIndex = 0;
  Double maximumValue = -1.0;
  for (int i = 0; i < numberOfClasses; i++) {
    if (probabilities.get(i) > maximumValue) {
      maximumValue = probabilities.get(i);
      maxIndex = i;
    }
  }
  return maxIndex;
}