/**
 * Trains a Bayes network classifier on a small hand-built dataset, evaluates it
 * on a separate test set, and prints the class distribution for one test instance.
 *
 * @param args command-line arguments (unused)
 * @throws Exception if training or evaluation fails
 */
public static void main(String[] args) throws Exception {
  Instances isTrainingSet = createSet(4);
  Instance instance1 = createInstance(new double[] {1, 0.7, 0.1, 0.7}, "S1", isTrainingSet);
  Instance instance2 = createInstance(new double[] {0.1, 0.2, 1, 0.3}, "S2", isTrainingSet);
  Instance instance3 = createInstance(new double[] {0, 0, 0, 0}, "S3", isTrainingSet);
  isTrainingSet.add(instance1);
  isTrainingSet.add(instance2);
  isTrainingSet.add(instance3);

  Instances isTestingSet = createSet(4);
  Instance testInstance1 = createInstance(new double[] {1, 0.7, 0.1, 0.7}, "S1", isTestingSet);
  Instance testInstance2 = createInstance(new double[] {0.1, 0.2, 1, 0.3}, "S2", isTestingSet);
  isTestingSet.add(testInstance1);
  isTestingSet.add(testInstance2);

  // Create a Bayes network classifier
  Classifier cModel = new BayesNet();
  cModel.buildClassifier(isTrainingSet);

  // Test the model
  Evaluation eTest = new Evaluation(isTrainingSet);
  eTest.evaluateModel(cModel, isTestingSet);

  // Print the results in the style of the Weka Explorer
  String strSummary = eTest.toSummaryString();
  System.out.println(strSummary);

  // Get the predicted probability of each class:
  // fDistribution[i] is the probability that the instance belongs to class i
  double[] fDistribution = cModel.distributionForInstance(testInstance2);
  for (int i = 0; i < fDistribution.length; i++) {
    System.out.println(fDistribution[i]);
  }
}
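The snippet above relies on two helpers, createSet and createInstance, that are not shown. A minimal sketch of what they might look like under the Weka 3.7+ API, assuming four numeric features and a nominal class with values S1, S2, S3 (the structure and names here are assumptions, not the original author's code):

// Hypothetical helpers assumed by the snippet above (Weka 3.7+ API).
private static Instances createSet(int numAttrs) {
  ArrayList<Attribute> attrs = new ArrayList<Attribute>();
  for (int i = 0; i < numAttrs; i++) {
    attrs.add(new Attribute("att" + i)); // numeric feature
  }
  attrs.add(new Attribute("class", Arrays.asList("S1", "S2", "S3"))); // nominal class
  Instances set = new Instances("dataset", attrs, 10);
  set.setClassIndex(set.numAttributes() - 1); // last attribute is the class
  return set;
}

private static Instance createInstance(double[] values, String label, Instances dataset) {
  double[] vals = Arrays.copyOf(values, values.length + 1); // extra slot for the class
  Instance inst = new DenseInstance(1.0, vals);
  inst.setDataset(dataset); // attach the header so the class label can be resolved
  inst.setClassValue(label);
  return inst;
}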
private static void writePredictedDistributions(
    Classifier c, Instances data, int idIndex, Writer out) throws Exception {
  // Header row: the id column followed by one quoted column per class value
  out.write("id");
  for (int i = 0; i < data.numClasses(); i++) {
    out.write(",\"");
    // Sanitize quotes and backslashes in the class label
    out.write(data.classAttribute().value(i).replaceAll("[\"\\\\]", "_"));
    out.write("\"");
  }
  out.write("\n");

  // One row per instance: id, then the predicted probability of each class
  for (int i = 0; i < data.numInstances(); i++) {
    final String id = data.instance(i).stringValue(idIndex);
    double[] distribution = c.distributionForInstance(data.instance(i));
    out.write(id);
    for (double probability : distribution) {
      out.write(",");
      // Truncate vanishingly small probabilities to zero to keep the file compact
      out.write(String.valueOf(probability > 1e-5 ? (float) probability : 0f));
    }
    out.write("\n");
  }
}
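A possible call site for writePredictedDistributions. The file names, model path, and id attribute index below are illustrative assumptions, and the fragment assumes an enclosing method that throws Exception:

// Hypothetical usage: write one row of class probabilities per test instance.
Instances data = new ConverterUtils.DataSource("test.arff").getDataSet();
data.setClassIndex(data.numAttributes() - 1);
Classifier c = (Classifier) SerializationHelper.read("bayes.model");
Writer out = new BufferedWriter(new FileWriter("distributions.csv"));
writePredictedDistributions(c, data, 0, out); // attribute 0 assumed to be a string id
out.close();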
public int SelectRow_KLDivergenceMisclassified(
    Instances pool, Classifier myEstimator, int desiredAttr) {
  // For each misclassified instance whose desiredAttr is unbought (missing),
  // measure the KL divergence (relative entropy between two probability
  // distributions) of its predicted class distribution P from uniform:
  //   KL(P||Q) = sum_i p_i log(p_i / q_i)
  // With respect to Q = Uniform this is KL(P||U) = log(n) + sum_i p_i log(p_i);
  // the additive constant log(n) does not affect the argmin, so we only
  // accumulate sum_i p_i log(p_i) and choose the row that is minimum
  // (i.e. closest to uniform).
  int numInstances = pool.numInstances();
  double[] KLDivs = new double[numInstances];
  boolean[] isValidInstance = new boolean[numInstances];
  boolean misclassified = false;
  double[] probs = null;
  Instance inst;

  for (int i = 0; i < numInstances; i++) {
    inst = pool.instance(i);
    try {
      misclassified = inst.classValue() != myEstimator.classifyInstance(inst);
    } catch (Exception e1) {
      e1.printStackTrace();
    }
    if (inst.isMissing(desiredAttr) && misclassified) {
      try {
        probs = myEstimator.distributionForInstance(inst);
      } catch (Exception e) {
        e.printStackTrace();
      }
      for (int j = 0; j < probs.length; j++) KLDivs[i] += MyXLogX(probs[j]);
      isValidInstance[i] = true;
    } else {
      KLDivs[i] = Double.MAX_VALUE;
      isValidInstance[i] = false;
    }
  }

  // Break ties among the minimum-divergence rows uniformly at random
  // ('r' is a java.util.Random field of the enclosing class)
  double leastDivergence = KLDivs[Utils.minIndex(KLDivs)];
  int numLeastDivs = 0;
  for (int i = 0; i < numInstances; i++)
    if (isValidInstance[i] && KLDivs[i] == leastDivergence) numLeastDivs++;
  if (numLeastDivs == 0) return -1; // no misclassified instance with the attribute missing
  int randomInstance = r.nextInt(numLeastDivs);
  int index = 0;
  for (int i = 0; i < numInstances; i++) {
    if (isValidInstance[i] && KLDivs[i] == leastDivergence) {
      if (index == randomInstance) return i;
      else index++;
    }
  }
  return -1;
}
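SelectRow_KLDivergenceMisclassified calls a MyXLogX helper that is not shown. From its use above it is presumably x log x with the usual 0 log 0 = 0 convention; a minimal sketch:

// Hypothetical helper: x * log(x) with 0 * log(0) taken as 0, so that summing
// MyXLogX over a probability vector gives KL(P||Uniform) minus the constant log(n).
private static double MyXLogX(double x) {
  return (x > 0.0) ? x * Math.log(x) : 0.0;
}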
public void batchPredict() {
  // Load the whole test set, apply the serialized model to each row, and
  // write one "id,probability" line per test instance.
  String modelFile = "data/AcquireValueShopper/decisionTable_bayes_trees.model";
  String pathTest = "data/AcquireValueShopper/test_new.csv";
  String pathPredict = "data/AcquireValueShopper/submission.csv";

  // Read the test set, keyed by the id in the first CSV column
  Map<String, String> testSet = new HashMap<String, String>();
  try {
    Scanner scanner = new Scanner(new File(pathTest));
    while (scanner.hasNextLine()) {
      String line = scanner.nextLine().trim();
      String id = line.split(",")[0];
      testSet.put(id, line);
    }
    scanner.close();
  } catch (FileNotFoundException e1) {
    e1.printStackTrace();
  }

  // Predict
  try {
    // Load the serialized model
    Classifier classifier = (Classifier) SerializationHelper.read(modelFile);
    PrintWriter output = new PrintWriter(pathPredict);
    output.append("id,repeatProbability\n");
    for (Map.Entry<String, String> entry : testSet.entrySet()) {
      Instances instances = buildInstance(entry.getValue());
      Instance instance = instances.instance(0);
      double[] returnProb = classifier.distributionForInstance(instance);
      double prob = returnProb[1]; // probability of the positive ("repeat") class
      output.append(entry.getKey() + "," + prob + "\n");
    }
    output.close();
  } catch (Exception e) {
    e.printStackTrace();
  }
}
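buildInstance is also not shown. A rough sketch of one way it could work, assuming a header field holding the training header (a hypothetical name), numeric features following the id column, and the class as the last attribute:

// Hypothetical sketch: parse one CSV row into a single-instance dataset.
private Instances buildInstance(String line) {
  String[] parts = line.split(",");
  double[] vals = new double[header.numAttributes()]; // 'header' is an assumed Instances field
  for (int i = 1; i < parts.length; i++) {
    vals[i - 1] = Double.parseDouble(parts[i]); // skip the id in column 0
  }
  Instances one = new Instances(header, 1); // copy the header, capacity 1
  one.add(new DenseInstance(1.0, vals));
  one.instance(0).setClassMissing(); // the class is unknown at prediction time
  return one;
}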
int SelectRow_ErrorMargin(Instances pool, Classifier myEstimator, int desiredAttr) {
  // For each instance whose desiredAttr is unbought (missing), measure
  // Prob(i, L(i)), the predicted probability of the instance's true label,
  // and choose the row that minimizes it, i.e. the most erroneous instance.
  int numInstances = pool.numInstances();
  double[] classProb = new double[numInstances];
  boolean[] isValidInstance = new boolean[numInstances];
  double[] probs = null;
  Instance inst;

  for (int i = 0; i < numInstances; i++) {
    inst = pool.instance(i);
    if (inst.isMissing(desiredAttr)) {
      try {
        probs = myEstimator.distributionForInstance(inst);
        classProb[i] = probs[(int) inst.classValue()];
        isValidInstance[i] = true;
      } catch (Exception e) {
        e.printStackTrace();
      }
    } else {
      classProb[i] = Double.POSITIVE_INFINITY;
      isValidInstance[i] = false;
    }
  }

  // Break ties among the least-correct rows uniformly at random
  // ('r' is a java.util.Random field of the enclosing class)
  double leastCorrect = classProb[Utils.minIndex(classProb)];
  int numLeastCorrect = 0;
  for (int i = 0; i < numInstances; i++) {
    if (isValidInstance[i] && classProb[i] == leastCorrect) numLeastCorrect++;
  }
  if (numLeastCorrect == 0) return -1; // no valid instance with the attribute missing
  int randomInstance = r.nextInt(numLeastCorrect);
  int index = 0;
  for (int i = 0; i < numInstances; i++) {
    if (isValidInstance[i] && classProb[i] == leastCorrect) {
      if (index == randomInstance) return i;
      else index++;
    }
  }
  return -1;
}
/**
 * Calculates the class membership probabilities for the given test instance.
 *
 * @param instance the instance to be classified
 * @return predicted class probability distribution
 * @throws Exception if there is a problem generating the prediction
 */
public double[] distributionForInstance(Instance instance) throws Exception {
  // Default model?
  if (m_ZeroR != null) {
    return m_ZeroR.distributionForInstance(instance);
  }

  // Local variables
  double[] probs = new double[m_NumClasses];
  double prob;
  double mutualInfoSum;

  // Store the instance's attribute values in an int array
  int[] attIndex = new int[m_NumAttributes];
  for (int att = 0; att < m_NumAttributes; att++) {
    if (att == m_ClassIndex) attIndex[att] = -1;
    else attIndex[att] = m_StartAttIndex[att] + (int) instance.value(att);
  }

  // Calculate the probability of each possible class value
  for (int classVal = 0; classVal < m_NumClasses; classVal++) {
    probs[classVal] = 0;
    prob = 1;
    mutualInfoSum = 0.0;
    for (int parent = 0; parent < m_NumAttributes; parent++) {
      if (attIndex[parent] == -1) continue;
      // Smoothed estimate of P(class, parent value)
      prob =
          (m_ClassAttAttCounts[classVal][attIndex[parent]][attIndex[parent]]
                  + 1.0 / (m_NumClasses * m_NumAttValues[parent]))
              / (m_NumInstances + 1.0);
      for (int son = 0; son < m_NumAttributes; son++) {
        if (attIndex[son] == -1 || son == parent) continue;
        // Smoothed estimate of P(son value | class, parent value)
        prob *=
            (m_ClassAttAttCounts[classVal][attIndex[parent]][attIndex[son]]
                    + 1.0 / m_NumAttValues[son])
                / (m_ClassAttAttCounts[classVal][attIndex[parent]][attIndex[parent]] + 1.0);
      }
      mutualInfoSum += m_mutualInformation[parent];
      probs[classVal] += m_mutualInformation[parent] * prob;
    }
    probs[classVal] /= mutualInfoSum;
  }
  if (!Double.isNaN(Utils.sum(probs))) Utils.normalize(probs);
  return probs;
}
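The weighting scheme above looks like a weighted average of one-dependence estimators (in the style of WAODE): every non-class attribute p acts in turn as a super-parent, its one-dependence estimate is weighted by the mutual information between that attribute and the class, and the weighted estimates are averaged. As an interpretation of the loop, with hats denoting the smoothed count ratios computed from m_ClassAttAttCounts:

\[
\hat P(c \mid \mathbf{x}) \;\propto\; \frac{\sum_{p} I(A_p; C)\, \hat P(c, x_p) \prod_{s \neq p} \hat P(x_s \mid c, x_p)}{\sum_{p} I(A_p; C)}
\]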
public int SelectRow_L2Norm(
    Instances pool, Classifier myEstimator, int desiredAttr, int desiredLabel) {
  // For each instance with unbought (missing) desiredAttr and label = desiredLabel,
  // measure the Euclidean distance of its predicted class distribution from
  // uniform, and choose the row closest to uniform as the one to buy from.
  double leastDistance = Double.MAX_VALUE;
  int leastIndex = -1;
  Instance inst;
  int n = pool.numClasses();
  double[] uniform = new double[n];
  double[] probs;
  for (int i = 0; i < n; i++) uniform[i] = 1.0 / (double) n;

  for (int i = 0; i < pool.numInstances(); i++) {
    inst = pool.instance(i);
    if ((int) inst.classValue() == desiredLabel && inst.isMissing(desiredAttr)) {
      // Valid instance: measure its distance from uniform,
      // sqrt( sum_j (p_j - u_j)^2 )
      probs = new double[n];
      try {
        probs = myEstimator.distributionForInstance(inst);
      } catch (Exception e) {
        e.printStackTrace();
      }
      double distance = 0.0;
      for (int j = 0; j < n; j++) distance += (probs[j] - uniform[j]) * (probs[j] - uniform[j]);
      distance = Math.sqrt(distance);
      if (distance < leastDistance) {
        leastDistance = distance;
        leastIndex = i;
      }
    }
  }
  return leastIndex;
}
/**
 * Calculates the class membership probabilities for the given test instance.
 *
 * @param instance the instance to be classified
 * @return predicted class probability distribution
 * @throws Exception if instance could not be classified successfully
 */
public double[] distributionForInstance(Instance instance) throws Exception {
  double[] sums = new double[instance.numClasses()];
  for (int i = -1; i < m_NumIterationsPerformed; i++) {
    double prob = 1, shrinkage = m_Shrinkage;
    if (i == -1) {
      // Iteration -1 is the ZeroR base model, applied without shrinkage
      prob = m_ZeroR.distributionForInstance(instance)[0];
      shrinkage = 1.0;
    } else {
      prob = m_Classifiers[i].distributionForInstance(instance)[0];
      // Make sure that probabilities are never 0 or 1 using ad-hoc smoothing
      prob = (m_SumOfWeights * prob + 1) / (m_SumOfWeights + 2);
    }
    // Accumulate half the log-odds, scaled by the shrinkage parameter
    sums[0] += shrinkage * 0.5 * (Math.log(prob) - Math.log(1 - prob));
  }
  sums[1] = -sums[0];
  return Utils.logs2probs(sums);
}
/**
 * Calculates the class membership probabilities for the given test instance.
 *
 * @param instance the instance to be classified
 * @return predicted class probability distribution
 * @exception Exception if distribution can't be computed successfully
 */
public double[] distributionForInstance(Instance instance) throws Exception {
  if (instance.classAttribute().isNumeric()) {
    throw new UnsupportedClassTypeException("Decorate can't handle a numeric class!");
  }
  double[] sums = new double[instance.numClasses()], newProbs;
  Classifier curr;

  for (int i = 0; i < m_Committee.size(); i++) {
    curr = (Classifier) m_Committee.get(i);
    newProbs = curr.distributionForInstance(instance);
    for (int j = 0; j < newProbs.length; j++) sums[j] += newProbs[j];
  }
  if (Utils.eq(Utils.sum(sums), 0)) {
    return sums;
  } else {
    Utils.normalize(sums);
    return sums;
  }
}
/**
 * Sets the weights for the next iteration.
 *
 * @param training the training instances
 * @param iteration the index of the current iteration (-1 for the ZeroR base model)
 * @throws Exception if something goes wrong
 */
protected void setWeights(Instances training, int iteration) throws Exception {
  for (Instance instance : training) {
    double reweight = 1;
    double prob = 1, shrinkage = m_Shrinkage;
    if (iteration == -1) {
      prob = m_ZeroR.distributionForInstance(instance)[0];
      shrinkage = 1.0;
    } else {
      prob = m_Classifiers[iteration].distributionForInstance(instance)[0];
      // Make sure that probabilities are never 0 or 1 using ad-hoc smoothing
      prob = (m_SumOfWeights * prob + 1) / (m_SumOfWeights + 2);
    }
    if (instance.classValue() == 1) {
      reweight = shrinkage * 0.5 * (Math.log(prob) - Math.log(1 - prob));
    } else {
      reweight = shrinkage * 0.5 * (Math.log(1 - prob) - Math.log(prob));
    }
    instance.setWeight(instance.weight() * Math.exp(reweight));
  }
}
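Together with the distributionForInstance snippet two entries up, this reweighting looks like a two-class LogitBoost-style additive model: each instance's weight is multiplied by the exponential of the shrunken half log-odds, signed by the true class, so instances the current base model gets wrong grow in weight. In symbols (an interpretation, not taken from the source):

\[
w_i \leftarrow w_i \cdot \exp\!\Big( \pm\, \eta \cdot \tfrac{1}{2} \log \tfrac{p_i}{1 - p_i} \Big)
\]

where \( p_i \) is the (smoothed) probability the base model assigns to class 0 and \( \eta \) is m_Shrinkage.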
@Override
public double[] distributionForInstance(Instance inst) throws Exception {
  return m_model.distributionForInstance(inst);
}
/**
 * Return the argmax of #distribution(Instance, double[]).
 *
 * @return argmax_{k in 0,1,...} p( y_j = k | x, y_pred )
 */
public double classify(Instance x, double[] ypred) throws Exception {
  Instance x_ = transform(x, ypred);
  return Utils.maxIndex(h.distributionForInstance(x_));
}
/**
 * Same as #distribution(Instance, double[]), but the Instance has already been
 * transformed with ypred.
 */
public double[] distributionT(Instance x_) throws Exception {
  return h.distributionForInstance(x_);
}
/**
 * The distribution of this node, given input x.
 *
 * @return p( y_j = k | x, y_pred ) for k in {0, 1}
 */
public double[] distribution(Instance x, double[] ypred) throws Exception {
  Instance x_ = transform(x, ypred);
  return h.distributionForInstance(x_);
}
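All three methods above delegate to a base classifier h and a transform(Instance, double[]) helper defined elsewhere; the pattern reads like a node in a classifier chain, where the parent labels predicted so far are appended to the input. A hypothetical sketch of such a transform, assuming a prepared template header m_template with one extra attribute per entry of ypred:

// Hypothetical sketch: append the parent predictions in ypred to x's attribute
// values and attach a template header so that h can consume the result.
private Instance transform(Instance x, double[] ypred) throws Exception {
  double[] vals = new double[x.numAttributes() + ypred.length];
  for (int i = 0; i < x.numAttributes(); i++) {
    vals[i] = x.value(i);
  }
  for (int j = 0; j < ypred.length; j++) {
    vals[x.numAttributes() + j] = ypred[j];
  }
  Instance x_ = new DenseInstance(1.0, vals);
  x_.setDataset(m_template); // 'm_template' is an assumed Instances field
  return x_;
}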
/**
 * Calculates the class membership probabilities for the given test instance.
 *
 * @param instance the instance to be classified
 * @return predicted class probability distribution
 * @throws Exception if distribution can't be computed successfully
 */
public double[] distributionForInstance(Instance instance) throws Exception {
  // Default model?
  if (m_ZeroR != null) {
    return m_ZeroR.distributionForInstance(instance);
  }

  if (m_Train.numInstances() == 0) {
    throw new Exception("No training instances!");
  }

  m_NNSearch.addInstanceInfo(instance);

  int k = m_Train.numInstances();
  if (!m_UseAllK && (m_kNN < k)) {
    k = m_kNN;
  }

  Instances neighbours = m_NNSearch.kNearestNeighbours(instance, k);
  double[] distances = m_NNSearch.getDistances();

  if (m_Debug) {
    System.out.println("Test Instance: " + instance);
    System.out.println(
        "For " + k + " kept " + neighbours.numInstances() + " out of "
            + m_Train.numInstances() + " instances.");
  }

  // If the nearest-neighbour search skipped so much that fewer than k remain
  if (k > distances.length) k = distances.length;

  if (m_Debug) {
    System.out.println("Instance Distances");
    for (int i = 0; i < distances.length; i++) {
      System.out.println("" + distances[i]);
    }
  }

  // Determine the bandwidth: the distance to the kth neighbour
  double bandwidth = distances[k - 1];

  if (bandwidth <= 0) {
    // If the kth distance is zero, then give all instances the same weight
    for (int i = 0; i < distances.length; i++) distances[i] = 1;
  } else {
    // Rescale the distances by the bandwidth
    for (int i = 0; i < distances.length; i++) distances[i] = distances[i] / bandwidth;
  }

  // Pass the distances through a weighting kernel
  for (int i = 0; i < distances.length; i++) {
    switch (m_WeightKernel) {
      case LINEAR:
        distances[i] = 1.0001 - distances[i];
        break;
      case EPANECHNIKOV:
        distances[i] = 3 / 4D * (1.0001 - distances[i] * distances[i]);
        break;
      case TRICUBE:
        distances[i] = Math.pow((1.0001 - Math.pow(distances[i], 3)), 3);
        break;
      case CONSTANT:
        distances[i] = 1;
        break;
      case INVERSE:
        distances[i] = 1.0 / (1.0 + distances[i]);
        break;
      case GAUSS:
        distances[i] = Math.exp(-distances[i] * distances[i]);
        break;
    }
  }

  if (m_Debug) {
    System.out.println("Instance Weights");
    for (int i = 0; i < distances.length; i++) {
      System.out.println("" + distances[i]);
    }
  }

  // Set the kernel weights on the neighbouring training instances
  double sumOfWeights = 0, newSumOfWeights = 0;
  for (int i = 0; i < distances.length; i++) {
    double weight = distances[i];
    Instance inst = neighbours.instance(i);
    sumOfWeights += inst.weight();
    newSumOfWeights += inst.weight() * weight;
    inst.setWeight(inst.weight() * weight);
  }

  // Rescale the weights so their total is unchanged
  for (int i = 0; i < neighbours.numInstances(); i++) {
    Instance inst = neighbours.instance(i);
    inst.setWeight(inst.weight() * sumOfWeights / newSumOfWeights);
  }

  // Create a locally weighted classifier on the neighbourhood
  m_Classifier.buildClassifier(neighbours);

  if (m_Debug) {
    System.out.println("Classifying test instance: " + instance);
    System.out.println("Built base classifier:\n" + m_Classifier.toString());
  }

  // Return the classifier's predictions
  return m_Classifier.distributionForInstance(instance);
}
private double[] calculateRegionProbs(int j, int i) throws Exception {
  double[] sumOfProbsForRegion = new double[m_trainingData.classAttribute().numValues()];

  for (int u = 0; u < m_numOfSamplesPerRegion; u++) {
    double[] sumOfProbsForLocation = new double[m_trainingData.classAttribute().numValues()];

    m_weightingAttsValues[m_xAttribute] = getRandomX(j);
    m_weightingAttsValues[m_yAttribute] = getRandomY(m_panelHeight - i - 1);

    m_dataGenerator.setWeightingValues(m_weightingAttsValues);
    double[] weights = m_dataGenerator.getWeights();
    double sumOfWeights = Utils.sum(weights);
    int[] indices = Utils.sort(weights);

    // Prune 1% of the weight mass: keep only the heaviest instances
    // covering 99% of the total weight
    int[] newIndices = new int[indices.length];
    double sumSoFar = 0;
    double criticalMass = 0.99 * sumOfWeights;
    int index = weights.length - 1;
    int counter = 0;
    for (int z = weights.length - 1; z >= 0; z--) {
      newIndices[index--] = indices[z];
      sumSoFar += weights[indices[z]];
      counter++;
      if (sumSoFar > criticalMass) {
        break;
      }
    }
    indices = new int[counter];
    System.arraycopy(newIndices, index + 1, indices, 0, counter);

    for (int z = 0; z < m_numOfSamplesPerGenerator; z++) {
      m_dataGenerator.setWeightingValues(m_weightingAttsValues);
      double[][] values = m_dataGenerator.generateInstances(indices);

      for (int q = 0; q < values.length; q++) {
        if (values[q] != null) {
          System.arraycopy(values[q], 0, m_vals, 0, m_vals.length);
          m_vals[m_xAttribute] = m_weightingAttsValues[m_xAttribute];
          m_vals[m_yAttribute] = m_weightingAttsValues[m_yAttribute];

          // Classify the generated instance (m_predInst is backed by m_vals)
          m_dist = m_classifier.distributionForInstance(m_predInst);

          for (int k = 0; k < sumOfProbsForLocation.length; k++) {
            sumOfProbsForLocation[k] += (m_dist[k] * weights[q]);
          }
        }
      }
    }

    for (int k = 0; k < sumOfProbsForRegion.length; k++) {
      sumOfProbsForRegion[k] += (sumOfProbsForLocation[k] * sumOfWeights);
    }
  }

  // Average
  Utils.normalize(sumOfProbsForRegion);

  // Cache a copy of the distribution
  double[] tempDist = new double[sumOfProbsForRegion.length];
  System.arraycopy(sumOfProbsForRegion, 0, tempDist, 0, sumOfProbsForRegion.length);

  return tempDist;
}
/**
 * Calculates the class membership probabilities for the given test instance.
 *
 * @param instance the instance to be classified
 * @return predicted class probability distribution
 * @throws Exception if class is numeric
 */
public double[] distributionForInstance(Instance instance) throws Exception {
  if (m_GroovyObject != null) return m_GroovyObject.distributionForInstance(instance);
  else return new double[instance.numClasses()];
}