/**
 * Builds a regression model for the given data.
 *
 * @param data the training data to be used for generating the linear regression function
 * @throws Exception if the classifier could not be built successfully
 */
public void buildClassifier(Instances data) throws Exception {

  if (!m_checksTurnedOff) {
    // can classifier handle the data?
    getCapabilities().testWithFail(data);

    // remove instances with missing class
    data = new Instances(data);
    data.deleteWithMissingClass();
  }

  // Preprocess instances
  if (!m_checksTurnedOff) {
    m_TransformFilter = new NominalToBinary();
    m_TransformFilter.setInputFormat(data);
    data = Filter.useFilter(data, m_TransformFilter);
    m_MissingFilter = new ReplaceMissingValues();
    m_MissingFilter.setInputFormat(data);
    data = Filter.useFilter(data, m_MissingFilter);
    data.deleteWithMissingClass();
  } else {
    m_TransformFilter = null;
    m_MissingFilter = null;
  }

  m_ClassIndex = data.classIndex();
  m_TransformedData = data;

  // Turn all attributes on for a start
  m_SelectedAttributes = new boolean[data.numAttributes()];
  for (int i = 0; i < data.numAttributes(); i++) {
    if (i != m_ClassIndex) {
      m_SelectedAttributes[i] = true;
    }
  }
  m_Coefficients = null;

  // Compute means and standard deviations
  m_Means = new double[data.numAttributes()];
  m_StdDevs = new double[data.numAttributes()];
  for (int j = 0; j < data.numAttributes(); j++) {
    if (j != data.classIndex()) {
      m_Means[j] = data.meanOrMode(j);
      m_StdDevs[j] = Math.sqrt(data.variance(j));
      if (m_StdDevs[j] == 0) {
        // Attribute is constant: drop it from the model
        m_SelectedAttributes[j] = false;
      }
    }
  }

  m_ClassStdDev = Math.sqrt(data.variance(m_TransformedData.classIndex()));
  m_ClassMean = data.meanOrMode(m_TransformedData.classIndex());

  // Perform the regression
  findBestModel();

  // Save memory
  m_TransformedData = new Instances(data, 0);
}
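// Usage sketch (not from the original source): driving the method above, assuming
// it belongs to Weka's LinearRegression classifier. The class choice, the file
// name "train.arff", and the last-attribute class index are illustrative assumptions.
import java.io.BufferedReader;
import java.io.FileReader;
import weka.classifiers.functions.LinearRegression;
import weka.core.Instances;

public class BuildSketch {
  public static void main(String[] args) throws Exception {
    Instances train = new Instances(new BufferedReader(new FileReader("train.arff")));
    train.setClassIndex(train.numAttributes() - 1); // assume class is the last attribute
    LinearRegression model = new LinearRegression();
    model.buildClassifier(train);
    System.out.println(model.classifyInstance(train.instance(0)));
  }
}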
/**
 * Compute and store statistics required for generating artificial data.
 *
 * @param data training instances
 * @exception Exception if statistics could not be calculated successfully
 */
protected void computeStats(Instances data) throws Exception {
  int numAttributes = data.numAttributes();
  m_AttributeStats = new Vector(numAttributes); // used to map attributes to their stats

  for (int j = 0; j < numAttributes; j++) {
    if (data.attribute(j).isNominal()) {
      // Compute the probability of occurrence of each distinct value
      int[] nomCounts = (data.attributeStats(j)).nominalCounts;
      double[] counts = new double[nomCounts.length];
      if (counts.length < 2)
        throw new Exception("Nominal attribute has less than two distinct values!");

      // Perform Laplace smoothing
      for (int i = 0; i < counts.length; i++)
        counts[i] = nomCounts[i] + 1;
      Utils.normalize(counts);
      double[] stats = new double[counts.length - 1];
      stats[0] = counts[0];
      // Calculate cumulative probabilities
      for (int i = 1; i < stats.length; i++)
        stats[i] = stats[i - 1] + counts[i];
      m_AttributeStats.add(j, stats);
    } else if (data.attribute(j).isNumeric()) {
      // Get mean and standard deviation from the training data
      double[] stats = new double[2];
      stats[0] = data.meanOrMode(j);
      stats[1] = Math.sqrt(data.variance(j));
      m_AttributeStats.add(j, stats);
    } else
      System.err.println("Decorate can only handle numeric and nominal values.");
  }
}
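// Sketch (not part of the original class): drawing an artificial nominal value
// from the cumulative-probability table built above. Only the first k-1 cumulative
// bounds are stored, so a draw past all bounds maps to the last value. The helper
// name sampleNominalIndex is hypothetical.
static int sampleNominalIndex(double[] cumulative, java.util.Random rand) {
  double r = rand.nextDouble();
  for (int i = 0; i < cumulative.length; i++)
    if (r < cumulative[i])
      return i;
  // r fell beyond the stored bounds: the last value gets the remaining mass
  return cumulative.length;
}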
/**
 * Computes the classification for the given test exemplar.
 *
 * @param ex the given test exemplar
 * @return the classification
 * @throws Exception if the exemplar could not be classified successfully
 */
public double classifyInstance(Instance ex) throws Exception {
  // Instance ex = new Exemplar(e);
  Instances exi = ex.relationalValue(1);
  double[] n = new double[m_Dimension];
  double[] xBar = new double[m_Dimension];
  for (int i = 0; i < exi.numAttributes(); i++)
    xBar[i] = exi.meanOrMode(i);

  for (int w = 0, t = 0; w < m_Dimension; w++, t++) {
    // if((t==m_ClassIndex) || (t==m_IdIndex))
    //   t++;
    // Effective count per dimension: sum of weights where attribute t is present
    for (int u = 0; u < exi.numInstances(); u++)
      if (!exi.instance(u).isMissing(t))
        n[w] += exi.instance(u).weight();
  }

  double logOdds = likelihoodRatio(n, xBar);
  return (logOdds > m_Cutoff) ? 1 : 0;
}
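// Sketch (assumption, not from the source): the call ex.relationalValue(1) above
// implies Weka's standard multi-instance layout -- bag ID at index 0, a relational
// attribute holding the bag's instances at index 1, and the class attribute last.
// The helper name describeBag is hypothetical.
static void describeBag(weka.core.Instance exemplar) {
  weka.core.Instances bag = exemplar.relationalValue(1);
  System.out.println("bag id   : " + exemplar.stringValue(0));
  System.out.println("bag size : " + bag.numInstances());
  System.out.println("class    : " + exemplar.classValue());
  for (int i = 0; i < bag.numAttributes(); i++)
    System.out.println("mean(" + i + ") = " + bag.meanOrMode(i));
}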
/**
 * Calculate metric value.
 *
 * @param mlData multi-label dataset for which the metric is calculated
 * @return value of the metric
 */
public double calculate(MultiLabelInstances mlData) {
  Instances instances = mlData.getDataSet();
  int nInstances = mlData.getNumInstances();

  double avg;
  double var2;
  double var4;
  double val;
  int nNumeric = 0;
  double mean = 0;

  Set<Attribute> attributesSet = mlData.getFeatureAttributes();

  for (Attribute att : attributesSet) {
    if (att.isNumeric()) {
      nNumeric++;
      avg = instances.meanOrMode(att);
      var2 = 0;
      var4 = 0;

      // Accumulate second and fourth central moments
      for (Instance inst : instances) {
        val = inst.value(att);
        var2 += Math.pow(val - avg, 2);
        var4 += Math.pow(val - avg, 4);
      }

      // Excess kurtosis, then the bias-corrected sample version
      double kurtosis = (nInstances * var4 / Math.pow(var2, 2)) - 3;
      double sampleKurtosis = (kurtosis * (nInstances + 1) + 6) * (nInstances - 1)
          / ((nInstances - 2) * (nInstances - 3));
      mean += sampleKurtosis;
    }
  }

  if (nNumeric > 0) {
    mean = mean / nNumeric;
  } else {
    mean = Double.NaN;
  }

  this.value = mean;
  return value;
}
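// Standalone numeric check of the two kurtosis formulas above. For the sample
// {1,2,3,4,5}: excess kurtosis g2 = -1.3 and the bias-corrected G2 = -1.2.
public class KurtosisSketch {
  public static void main(String[] args) {
    double[] x = {1, 2, 3, 4, 5};
    int n = x.length;
    double avg = 0;
    for (double v : x) avg += v;
    avg /= n;
    double var2 = 0, var4 = 0;
    for (double v : x) {
      var2 += Math.pow(v - avg, 2); // second central moment (unnormalized)
      var4 += Math.pow(v - avg, 4); // fourth central moment (unnormalized)
    }
    double g2 = n * var4 / (var2 * var2) - 3;                           // -1.3
    double G2 = (g2 * (n + 1) + 6) * (n - 1) / ((n - 2.0) * (n - 3.0)); // -1.2
    System.out.println("g2=" + g2 + ", G2=" + G2);
  }
}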
/**
 * Signify that this batch of input to the filter is finished. If the filter requires all
 * instances prior to filtering, output() may now be called to retrieve the filtered instances.
 *
 * @return true if there are instances pending output
 * @throws IllegalStateException if no input structure has been defined
 */
public boolean batchFinished() {
  if (getInputFormat() == null)
    throw new IllegalStateException("No input instance format defined");

  if (m_Means == null) {
    Instances input = getInputFormat();
    m_Means = new double[input.numAttributes()];
    for (int i = 0; i < input.numAttributes(); i++) {
      if (input.attribute(i).isNumeric() && (input.classIndex() != i)) {
        m_Means[i] = input.meanOrMode(i);
      }
    }

    // Convert pending input instances
    for (int i = 0; i < input.numInstances(); i++)
      convertInstance(input.instance(i));
  }

  // Free memory
  flushInput();
  m_NewBatch = true;
  return (numPendingOutput() != 0);
}
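// Context sketch (not from the original class): the standard Weka batch-filter
// protocol that drives the method above; Filter.useFilter(data, filter) wraps the
// same sequence. The helper name runBatch is hypothetical.
static weka.core.Instances runBatch(weka.filters.Filter filter, weka.core.Instances data)
    throws Exception {
  filter.setInputFormat(data);        // declare the input structure
  for (int i = 0; i < data.numInstances(); i++)
    filter.input(data.instance(i));   // queue this batch's instances
  filter.batchFinished();             // triggers the mean computation above
  weka.core.Instances out = filter.getOutputFormat();
  weka.core.Instance processed;
  while ((processed = filter.output()) != null)
    out.add(processed);               // collect the converted instances
  return out;
}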
/**
 * Computes the distribution for a given exemplar.
 *
 * @param ex the exemplar for which distribution is computed
 * @return the distribution
 * @throws Exception if the distribution can't be computed successfully
 */
public double[] distributionForInstance(Instance ex) throws Exception {
  double[] distribution = new double[2];
  Instances exi = ex.relationalValue(1);
  double[] n = new double[m_Dimension];
  double[] xBar = new double[m_Dimension];
  for (int i = 0; i < exi.numAttributes(); i++)
    xBar[i] = exi.meanOrMode(i);

  for (int w = 0, t = 0; w < m_Dimension; w++, t++) {
    for (int u = 0; u < exi.numInstances(); u++)
      if (!exi.instance(u).isMissing(t))
        n[w] += exi.instance(u).weight();
  }

  double logOdds = likelihoodRatio(n, xBar);

  // The returned logOdds value has been divided by m_Dimension to keep
  // Math.exp(logOdds) from getting too large or too small, which could
  // otherwise pin the distribution at a fixed value (1 or 0).
  distribution[0] = 1 / (1 + Math.exp(logOdds)); // Prob. for class 0 (negative)
  distribution[1] = 1 - distribution[0];

  return distribution;
}
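// Numeric check of the log-odds -> probability mapping above: positive log-odds
// shift mass toward class 1, and logOdds = 0 yields 0.5 / 0.5.
double logOdds = 2.0;
double pNeg = 1 / (1 + Math.exp(logOdds)); // ~0.119 for class 0
double pPos = 1 - pNeg;                    // ~0.881 for class 1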
/**
 * Builds the classifier from the given training exemplars.
 *
 * @param exs the training exemplars
 * @throws Exception if the model cannot be built properly
 */
public void buildClassifier(Instances exs) throws Exception {
  // can classifier handle the data?
  getCapabilities().testWithFail(exs);

  // remove instances with missing class
  exs = new Instances(exs);
  exs.deleteWithMissingClass();

  int numegs = exs.numInstances();
  m_Dimension = exs.attribute(1).relation().numAttributes();
  Instances pos = new Instances(exs, 0), neg = new Instances(exs, 0);

  // Divide into positive and negative bags
  for (int u = 0; u < numegs; u++) {
    Instance example = exs.instance(u);
    if (example.classValue() == 1)
      pos.add(example);
    else
      neg.add(example);
  }

  int pnum = pos.numInstances(), nnum = neg.numInstances();

  m_MeanP = new double[pnum][m_Dimension];
  m_VarianceP = new double[pnum][m_Dimension];
  m_SumP = new double[pnum][m_Dimension];
  m_MeanN = new double[nnum][m_Dimension];
  m_VarianceN = new double[nnum][m_Dimension];
  m_SumN = new double[nnum][m_Dimension];
  m_ParamsP = new double[4 * m_Dimension];
  m_ParamsN = new double[4 * m_Dimension];

  // Estimation of the parameters: as the start value for search
  double[] pSumVal = new double[m_Dimension], // for m
      nSumVal = new double[m_Dimension];
  double[] maxVarsP = new double[m_Dimension], // for a
      maxVarsN = new double[m_Dimension];
  // Mean of sample variances: for b, b=a/E(\sigma^2)+2
  double[] varMeanP = new double[m_Dimension], varMeanN = new double[m_Dimension];
  // Variances of sample means: for w, w=E[var(\mu)]/E[\sigma^2]
  double[] meanVarP = new double[m_Dimension], meanVarN = new double[m_Dimension];
  // number of exemplars without all values missing
  double[] numExsP = new double[m_Dimension], numExsN = new double[m_Dimension];

  // Extract metadata from both positive and negative bags
  for (int v = 0; v < pnum; v++) {
    /*Exemplar px = pos.exemplar(v);
      m_MeanP[v] = px.meanOrMode();
      m_VarianceP[v] = px.variance();
      Instances pxi = px.getInstances();
    */
    Instances pxi = pos.instance(v).relationalValue(1);
    for (int k = 0; k < pxi.numAttributes(); k++) {
      m_MeanP[v][k] = pxi.meanOrMode(k);
      m_VarianceP[v][k] = pxi.variance(k);
    }

    for (int w = 0, t = 0; w < m_Dimension; w++, t++) {
      // if((t==m_ClassIndex) || (t==m_IdIndex))
      //   t++;
      if (!Double.isNaN(m_MeanP[v][w])) {
        for (int u = 0; u < pxi.numInstances(); u++) {
          Instance ins = pxi.instance(u);
          if (!ins.isMissing(t))
            m_SumP[v][w] += ins.weight();
        }
        numExsP[w]++;
        pSumVal[w] += m_MeanP[v][w];
        meanVarP[w] += m_MeanP[v][w] * m_MeanP[v][w];
        if (maxVarsP[w] < m_VarianceP[v][w])
          maxVarsP[w] = m_VarianceP[v][w];
        varMeanP[w] += m_VarianceP[v][w];
        m_VarianceP[v][w] *= (m_SumP[v][w] - 1.0);
        if (m_VarianceP[v][w] < 0.0)
          m_VarianceP[v][w] = 0.0;
      }
    }
  }

  for (int v = 0; v < nnum; v++) {
    /*Exemplar nx = neg.exemplar(v);
      m_MeanN[v] = nx.meanOrMode();
      m_VarianceN[v] = nx.variance();
      Instances nxi = nx.getInstances();
    */
    Instances nxi = neg.instance(v).relationalValue(1);
    for (int k = 0; k < nxi.numAttributes(); k++) {
      m_MeanN[v][k] = nxi.meanOrMode(k);
      m_VarianceN[v][k] = nxi.variance(k);
    }

    for (int w = 0, t = 0; w < m_Dimension; w++, t++) {
      // if((t==m_ClassIndex) || (t==m_IdIndex))
      //   t++;
      if (!Double.isNaN(m_MeanN[v][w])) {
        for (int u = 0; u < nxi.numInstances(); u++)
          if (!nxi.instance(u).isMissing(t))
            m_SumN[v][w] += nxi.instance(u).weight();
        numExsN[w]++;
        nSumVal[w] += m_MeanN[v][w];
        meanVarN[w] += m_MeanN[v][w] * m_MeanN[v][w];
        if (maxVarsN[w] < m_VarianceN[v][w])
          maxVarsN[w] = m_VarianceN[v][w];
        varMeanN[w] += m_VarianceN[v][w];
        m_VarianceN[v][w] *= (m_SumN[v][w] - 1.0);
        if (m_VarianceN[v][w] < 0.0)
          m_VarianceN[v][w] = 0.0;
      }
    }
  }

  for (int w = 0; w < m_Dimension; w++) {
    pSumVal[w] /= numExsP[w];
    nSumVal[w] /= numExsN[w];
    if (numExsP[w] > 1)
      meanVarP[w] = meanVarP[w] / (numExsP[w] - 1.0)
          - pSumVal[w] * pSumVal[w] * numExsP[w] / (numExsP[w] - 1.0);
    if (numExsN[w] > 1)
      meanVarN[w] = meanVarN[w] / (numExsN[w] - 1.0)
          - nSumVal[w] * nSumVal[w] * numExsN[w] / (numExsN[w] - 1.0);
    varMeanP[w] /= numExsP[w];
    varMeanN[w] /= numExsN[w];
  }

  // Bounds and parameter values for each run
  double[][] bounds = new double[2][4];
  double[] pThisParam = new double[4], nThisParam = new double[4];

  // Initial values for parameters
  double a, b, w, m;

  // Optimize for one dimension
  for (int x = 0; x < m_Dimension; x++) {
    if (getDebug())
      System.err.println("\n\n!!!!!!!!!!!!!!!!!!!!!!???Dimension #" + x);

    // Positive exemplars: first run
    a = (maxVarsP[x] > ZERO) ? maxVarsP[x] : 1.0;
    if (varMeanP[x] <= ZERO)
      varMeanP[x] = ZERO; // modified by LinDong (09/2005)
    b = a / varMeanP[x] + 2.0; // a/(b-2) = E(\sigma^2)
    w = meanVarP[x] / varMeanP[x]; // E[var(\mu)] = w*E[\sigma^2]
    if (w <= ZERO)
      w = 1.0;
    m = pSumVal[x];
    pThisParam[0] = a; // a
    pThisParam[1] = b; // b
    pThisParam[2] = w; // w
    pThisParam[3] = m; // m

    // Negative exemplars: first run
    a = (maxVarsN[x] > ZERO) ? maxVarsN[x] : 1.0;
    if (varMeanN[x] <= ZERO)
      varMeanN[x] = ZERO; // modified by LinDong (09/2005)
    b = a / varMeanN[x] + 2.0; // a/(b-2) = E(\sigma^2)
    w = meanVarN[x] / varMeanN[x]; // E[var(\mu)] = w*E[\sigma^2]
    if (w <= ZERO)
      w = 1.0;
    m = nSumVal[x];
    nThisParam[0] = a; // a
    nThisParam[1] = b; // b
    nThisParam[2] = w; // w
    nThisParam[3] = m; // m

    // Bound constraints
    bounds[0][0] = ZERO;       // a > 0
    bounds[0][1] = 2.0 + ZERO; // b > 2
    bounds[0][2] = ZERO;       // w > 0
    bounds[0][3] = Double.NaN; // m is unbounded

    for (int t = 0; t < 4; t++) {
      bounds[1][t] = Double.NaN; // no upper bounds
      m_ParamsP[4 * x + t] = pThisParam[t];
      m_ParamsN[4 * x + t] = nThisParam[t];
    }

    double pminVal = Double.MAX_VALUE, nminVal = Double.MAX_VALUE;
    Random whichEx = new Random(m_Seed);
    TLD_Optm pOp = null, nOp = null;
    boolean isRunValid = true;
    double[] sumP = new double[pnum], meanP = new double[pnum], varP = new double[pnum];
    double[] sumN = new double[nnum], meanN = new double[nnum], varN = new double[nnum];

    // One dimension
    for (int p = 0; p < pnum; p++) {
      sumP[p] = m_SumP[p][x];
      meanP[p] = m_MeanP[p][x];
      varP[p] = m_VarianceP[p][x];
    }
    for (int q = 0; q < nnum; q++) {
      sumN[q] = m_SumN[q][x];
      meanN[q] = m_MeanN[q][x];
      varN[q] = m_VarianceN[q][x];
    }

    for (int y = 0; y < m_Run; ) {
      if (getDebug())
        System.err.println("\n\n!!!!!!!!!!!!!!!!!!!!!!???Run #" + y);
      double thisMin;

      if (getDebug())
        System.err.println("\nPositive exemplars");
      pOp = new TLD_Optm();
      pOp.setNum(sumP);
      pOp.setSSquare(varP);
      pOp.setXBar(meanP);

      pThisParam = pOp.findArgmin(pThisParam, bounds);
      while (pThisParam == null) {
        pThisParam = pOp.getVarbValues();
        if (getDebug())
          System.err.println("!!! 200 iterations finished, not enough!");
        pThisParam = pOp.findArgmin(pThisParam, bounds);
      }

      thisMin = pOp.getMinFunction();
      if (!Double.isNaN(thisMin) && (thisMin < pminVal)) {
        pminVal = thisMin;
        for (int z = 0; z < 4; z++)
          m_ParamsP[4 * x + z] = pThisParam[z];
      }
      if (Double.isNaN(thisMin)) {
        pThisParam = new double[4];
        isRunValid = false;
      }

      if (getDebug())
        System.err.println("\nNegative exemplars");
      nOp = new TLD_Optm();
      nOp.setNum(sumN);
      nOp.setSSquare(varN);
      nOp.setXBar(meanN);

      nThisParam = nOp.findArgmin(nThisParam, bounds);
      while (nThisParam == null) {
        nThisParam = nOp.getVarbValues();
        if (getDebug())
          System.err.println("!!! 200 iterations finished, not enough!");
        nThisParam = nOp.findArgmin(nThisParam, bounds);
      }

      thisMin = nOp.getMinFunction();
      if (!Double.isNaN(thisMin) && (thisMin < nminVal)) {
        nminVal = thisMin;
        for (int z = 0; z < 4; z++)
          m_ParamsN[4 * x + z] = nThisParam[z];
      }
      if (Double.isNaN(thisMin)) {
        nThisParam = new double[4];
        isRunValid = false;
      }

      if (!isRunValid) {
        y--;
        isRunValid = true;
      }

      if (++y < m_Run) {
        // Change the initial parameters and restart
        int pone = whichEx.nextInt(pnum), // Randomly pick one pos. exemplar
            none = whichEx.nextInt(nnum);

        // Positive exemplars: next run
        while ((m_SumP[pone][x] <= 1.0) || Double.isNaN(m_MeanP[pone][x]))
          pone = whichEx.nextInt(pnum);
        a = m_VarianceP[pone][x] / (m_SumP[pone][x] - 1.0);
        if (a <= ZERO)
          a = m_ParamsN[4 * x]; // Change to negative params
        m = m_MeanP[pone][x];
        double sq = (m - m_ParamsP[4 * x + 3]) * (m - m_ParamsP[4 * x + 3]);

        b = a * m_ParamsP[4 * x + 2] / sq + 2.0; // b=a/Var+2, assuming Var=Sq/w'
        if ((b <= ZERO) || Double.isNaN(b) || Double.isInfinite(b))
          b = m_ParamsN[4 * x + 1];

        w = sq * (m_ParamsP[4 * x + 1] - 2.0) / m_ParamsP[4 * x]; // w=Sq/Var, assuming Var=a'/(b'-2)
        if ((w <= ZERO) || Double.isNaN(w) || Double.isInfinite(w))
          w = m_ParamsN[4 * x + 2];

        pThisParam[0] = a; // a
        pThisParam[1] = b; // b
        pThisParam[2] = w; // w
        pThisParam[3] = m; // m

        // Negative exemplars: next run
        while ((m_SumN[none][x] <= 1.0) || Double.isNaN(m_MeanN[none][x]))
          none = whichEx.nextInt(nnum);
        a = m_VarianceN[none][x] / (m_SumN[none][x] - 1.0);
        if (a <= ZERO)
          a = m_ParamsP[4 * x];
        m = m_MeanN[none][x];
        sq = (m - m_ParamsN[4 * x + 3]) * (m - m_ParamsN[4 * x + 3]);

        b = a * m_ParamsN[4 * x + 2] / sq + 2.0; // b=a/Var+2, assuming Var=Sq/w'
        if ((b <= ZERO) || Double.isNaN(b) || Double.isInfinite(b))
          b = m_ParamsP[4 * x + 1];

        w = sq * (m_ParamsN[4 * x + 1] - 2.0) / m_ParamsN[4 * x]; // w=Sq/Var, assuming Var=a'/(b'-2)
        if ((w <= ZERO) || Double.isNaN(w) || Double.isInfinite(w))
          w = m_ParamsP[4 * x + 2];

        nThisParam[0] = a; // a
        nThisParam[1] = b; // b
        nThisParam[2] = w; // w
        nThisParam[3] = m; // m
      }
    }
  }

  for (int x = 0, y = 0; x < m_Dimension; x++, y++) {
    // if((x==exs.classIndex()) || (x==exs.idIndex()))
    //   y++;
    a = m_ParamsP[4 * x];
    b = m_ParamsP[4 * x + 1];
    w = m_ParamsP[4 * x + 2];
    m = m_ParamsP[4 * x + 3];
    if (getDebug())
      System.err.println("\n\n???Positive: ( " + exs.attribute(1).relation().attribute(y)
          + "): a=" + a + ", b=" + b + ", w=" + w + ", m=" + m);

    a = m_ParamsN[4 * x];
    b = m_ParamsN[4 * x + 1];
    w = m_ParamsN[4 * x + 2];
    m = m_ParamsN[4 * x + 3];
    if (getDebug())
      System.err.println("???Negative: (" + exs.attribute(1).relation().attribute(y)
          + "): a=" + a + ", b=" + b + ", w=" + w + ", m=" + m);
  }

  if (m_UseEmpiricalCutOff) {
    // Find the empirical cut-off
    double[] pLogOdds = new double[pnum], nLogOdds = new double[nnum];
    for (int p = 0; p < pnum; p++)
      pLogOdds[p] = likelihoodRatio(m_SumP[p], m_MeanP[p], m_VarianceP[p]);
    for (int q = 0; q < nnum; q++)
      nLogOdds[q] = likelihoodRatio(m_SumN[q], m_MeanN[q], m_VarianceN[q]);

    // Update m_Cutoff
    findCutOff(pLogOdds, nLogOdds);
  } else
    m_Cutoff = -Math.log((double) pnum / (double) nnum);

  if (getDebug())
    System.err.println("???Cut-off=" + m_Cutoff);
}
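// Numeric illustration of the default cut-off above: with 30 positive and 70
// negative training bags, m_Cutoff = -ln(30/70) ~= 0.847, so a test bag needs
// log-odds above 0.847 to be labelled positive, compensating for the class prior.
double cutoff = -Math.log(30.0 / 70.0); // ~= 0.8473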
/**
 * Builds a simple linear regression model on the single attribute that
 * minimizes the squared error.
 *
 * @param insts the training instances
 * @throws Exception if a non-numeric attribute is encountered
 */
public void buildClassifier(Instances insts) throws Exception {

  // Compute mean of target value
  double yMean = insts.meanOrMode(insts.classIndex());

  // Choose best attribute
  double minMsq = Double.MAX_VALUE;
  m_attribute = null;
  int chosen = -1;
  double chosenSlope = Double.NaN;
  double chosenIntercept = Double.NaN;

  for (int i = 0; i < insts.numAttributes(); i++) {
    if (i != insts.classIndex()) {
      if (!insts.attribute(i).isNumeric()) {
        throw new Exception("UnivariateLinearRegression: Only numeric attributes!");
      }
      m_attribute = insts.attribute(i);

      // Compute slope and intercept
      double xMean = insts.meanOrMode(i);
      double sumWeightedXDiffSquared = 0;
      double sumWeightedYDiffSquared = 0;
      m_slope = 0;
      for (int j = 0; j < insts.numInstances(); j++) {
        Instance inst = insts.instance(j);
        if (!inst.isMissing(i) && !inst.classIsMissing()) {
          double xDiff = inst.value(i) - xMean;
          double yDiff = inst.classValue() - yMean;
          double weightedXDiff = inst.weight() * xDiff;
          double weightedYDiff = inst.weight() * yDiff;
          m_slope += weightedXDiff * yDiff;
          sumWeightedXDiffSquared += weightedXDiff * xDiff;
          sumWeightedYDiffSquared += weightedYDiff * yDiff;
        }
      }

      // Skip attribute if not useful
      if (sumWeightedXDiffSquared == 0) {
        continue;
      }
      double numerator = m_slope;
      m_slope /= sumWeightedXDiffSquared;
      m_intercept = yMean - m_slope * xMean;

      // Compute sum of squared errors
      double msq = sumWeightedYDiffSquared - m_slope * numerator;

      // Check whether this is the best attribute
      if (msq < minMsq) {
        minMsq = msq;
        chosen = i;
        chosenSlope = m_slope;
        chosenIntercept = m_intercept;
      }
    }
  }

  // Set parameters
  if (chosen == -1) {
    System.err.println("----- no useful attribute found");
    m_attribute = null;
    m_slope = 0;
    m_intercept = yMean;
  } else {
    m_attribute = insts.attribute(chosen);
    m_slope = chosenSlope;
    m_intercept = chosenIntercept;
  }
}
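// Standalone check of the weighted least-squares update above (all weights 1).
// For the points (1,2), (2,4), (3,6) the slope is 2 and the intercept is 0.
public class SlopeSketch {
  public static void main(String[] args) {
    double[] x = {1, 2, 3}, y = {2, 4, 6};
    double xMean = 2, yMean = 4;
    double num = 0, den = 0;
    for (int j = 0; j < x.length; j++) {
      num += (x[j] - xMean) * (y[j] - yMean); // weightedXDiff * yDiff
      den += (x[j] - xMean) * (x[j] - xMean); // weightedXDiff * xDiff
    }
    double slope = num / den;                 // 2.0
    double intercept = yMean - slope * xMean; // 0.0
    System.out.println("slope=" + slope + ", intercept=" + intercept);
  }
}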
/**
 * Generates the classifier.
 *
 * @param data set of instances serving as training data
 * @throws Exception if the classifier has not been generated successfully
 */
public void buildClassifier(Instances data) throws Exception {
  // can classifier handle the data?
  getCapabilities().testWithFail(data);

  // remove instances with missing class
  m_theInstances = new Instances(data);
  m_theInstances.deleteWithMissingClass();

  m_rr = new Random(1);

  if (m_theInstances.classAttribute().isNominal()) {
    // Set up class priors
    m_classPriorCounts = new double[data.classAttribute().numValues()];
    Arrays.fill(m_classPriorCounts, 1.0);
    for (int i = 0; i < data.numInstances(); i++) {
      Instance curr = data.instance(i);
      m_classPriorCounts[(int) curr.classValue()] += curr.weight();
    }
    m_classPriors = m_classPriorCounts.clone();
    Utils.normalize(m_classPriors);
  }

  setUpEvaluator();

  if (m_theInstances.classAttribute().isNumeric()) {
    m_disTransform = new weka.filters.unsupervised.attribute.Discretize();
    m_classIsNominal = false;

    // use binned discretisation if the class is numeric
    ((weka.filters.unsupervised.attribute.Discretize) m_disTransform).setBins(10);
    ((weka.filters.unsupervised.attribute.Discretize) m_disTransform).setInvertSelection(true);

    // Discretize all attributes EXCEPT the class
    String rangeList = "";
    rangeList += (m_theInstances.classIndex() + 1);
    // System.out.println("The class col: "+m_theInstances.classIndex());
    ((weka.filters.unsupervised.attribute.Discretize) m_disTransform)
        .setAttributeIndices(rangeList);
  } else {
    m_disTransform = new weka.filters.supervised.attribute.Discretize();
    ((weka.filters.supervised.attribute.Discretize) m_disTransform).setUseBetterEncoding(true);
    m_classIsNominal = true;
  }

  m_disTransform.setInputFormat(m_theInstances);
  m_theInstances = Filter.useFilter(m_theInstances, m_disTransform);

  m_numAttributes = m_theInstances.numAttributes();
  m_numInstances = m_theInstances.numInstances();
  m_majority = m_theInstances.meanOrMode(m_theInstances.classAttribute());

  // Perform the search
  int[] selected = m_search.search(m_evaluator, m_theInstances);

  m_decisionFeatures = new int[selected.length + 1];
  System.arraycopy(selected, 0, m_decisionFeatures, 0, selected.length);
  m_decisionFeatures[m_decisionFeatures.length - 1] = m_theInstances.classIndex();

  // reduce instances to selected features
  m_delTransform = new Remove();
  m_delTransform.setInvertSelection(true); // set features to keep
  m_delTransform.setAttributeIndicesArray(m_decisionFeatures);
  m_delTransform.setInputFormat(m_theInstances);
  m_dtInstances = Filter.useFilter(m_theInstances, m_delTransform);

  // reset the number of attributes
  m_numAttributes = m_dtInstances.numAttributes();

  // create hash table
  m_entries = new Hashtable((int) (m_dtInstances.numInstances() * 1.5));

  // insert instances into the hash table
  for (int i = 0; i < m_numInstances; i++) {
    Instance inst = m_dtInstances.instance(i);
    insertIntoTable(inst, null);
  }

  // Replace the global table majority with nearest neighbour?
  if (m_useIBk) {
    m_ibk = new IBk();
    m_ibk.buildClassifier(m_theInstances);
  }

  // Save memory
  if (m_saveMemory) {
    m_theInstances = new Instances(m_theInstances, 0);
    m_dtInstances = new Instances(m_dtInstances, 0);
  }
  m_evaluation = null;
}
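// Numeric sketch of the Laplace-corrected class priors set up above: counts start
// at 1, observed weights are added, then Utils.normalize divides by the sum.
// With observed class counts {3, 1}: priors = {(3+1)/6, (1+1)/6} = {0.667, 0.333}.
double[] priorCounts = {3 + 1.0, 1 + 1.0};
double sum = priorCounts[0] + priorCounts[1];
double[] priors = {priorCounts[0] / sum, priorCounts[1] / sum};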
@Override
public void buildClusterer(Instances data) throws Exception {
  reset();

  meanInstance = new DenseInstance(data.numAttributes());
  for (int i = 0; i < data.numAttributes(); i++)
    meanInstance.setValue(i, data.meanOrMode(i));
  numInstances = data.numInstances();

  kMeans.setDistanceFunction(distanceFunction);
  kMeans.setMaxIterations(maxIterations);
  // kMeans.setInitializeUsingKMeansPlusPlusMethod(initializeWithKMeansPlusPlus);
  if (initializeWithKMeansPlusPlus) {
    kMeans.setInitializationMethod(
        new weka.core.SelectedTag(SimpleKMeans.KMEANS_PLUS_PLUS, SimpleKMeans.TAGS_SELECTION));
  }

  /** step 1: iterate over all restarts and possible k values, record CH-scores */
  Random r = new Random(m_Seed);
  double meanCHs[] = new double[maxNumClusters + 1 - minNumClusters];
  double maxCHs[] = new double[maxNumClusters + 1 - minNumClusters];
  int maxSeed[] = new int[maxNumClusters + 1 - minNumClusters];

  for (int i = 0; i < restarts; i++) {
    if (printDebug)
      System.out.println("cascade> restarts: " + (i + 1) + " / " + restarts);
    for (int k = minNumClusters; k <= maxNumClusters; k++) {
      if (printDebug)
        System.out.print("cascade> k:" + k + " ");
      int seed = r.nextInt();
      kMeans.setSeed(seed);
      kMeans.setNumClusters(k);
      kMeans.buildClusterer(data);

      double ch = getCalinskiHarabasz();
      int index = k - minNumClusters;
      meanCHs[index] = (meanCHs[index] * i + ch) / (double) (i + 1);
      if (i == 0 || ch > maxCHs[index]) {
        maxCHs[index] = ch;
        maxSeed[index] = seed;
      }

      if (printDebug)
        System.out.println(" CH:" + df.format(ch)
            + " W:" + df.format(
                kMeans.getSquaredError() / (double) (numInstances - kMeans.getNumClusters()))
            + " (unweighted:" + df.format(kMeans.getSquaredError())
            + ") B:" + df.format(
                getSquaredErrorBetweenClusters() / (double) (kMeans.getNumClusters() - 1))
            + " (unweighted:" + df.format(getSquaredErrorBetweenClusters()) + ") ");
    }
  }

  if (printDebug) {
    String s = "cascade> max CH: [ ";
    for (int i = 0; i < maxSeed.length; i++)
      s += df.format(maxCHs[i]) + " ";
    System.out.println(s + "]");
  }
  String s = "cascade> mean CH: [ ";
  for (int i = 0; i < maxSeed.length; i++)
    s += df.format(meanCHs[i]) + " ";
  finalMeanCH = s + "]";
  // System.out.println(s + "]");

  /** step 2: select k with best mean CH-score; select seed for max CH score for this k */
  int bestK = -1;
  double maxCH = -1;
  for (int k = minNumClusters; k <= maxNumClusters; k++) {
    int index = k - minNumClusters;
    if (bestK == -1 || meanCHs[index] > maxCH) {
      maxCH = meanCHs[index];
      bestK = k;
    }
  }
  if (manuallySelectNumClusters) {
    int selectedK = selectKManually(meanCHs, bestK);
    if (selectedK != -1)
      bestK = selectedK;
  }
  int bestSeed = maxSeed[bestK - minNumClusters];
  finalBestK = bestK;
  finalBestSeed = bestSeed;
  // System.out.println("cascade> k (yields highest mean CH): " + bestK);
  // System.out.println("cascade> seed (highest CH for k=" + bestK + ") : " + bestSeed);

  kMeans.setSeed(bestSeed);
  kMeans.setNumClusters(bestK);
  kMeans.buildClusterer(data);
}
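// Hedged sketch: getCalinskiHarabasz() is not shown here, but the debug output
// above prints B/(k-1) and W/(n-k), which suggests the usual Calinski-Harabasz
// index. B = between-cluster squared error, W = within-cluster squared error,
// n = number of instances, k = number of clusters; larger CH = better separation.
static double calinskiHarabasz(double b, double w, int n, int k) {
  return (b / (k - 1)) / (w / (n - k));
}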
/**
 * Builds the classifier from the given training exemplars.
 *
 * @param exs the training exemplars
 * @throws Exception if the model cannot be built properly
 */
public void buildClassifier(Instances exs) throws Exception {
  // can classifier handle the data?
  getCapabilities().testWithFail(exs);

  // remove instances with missing class
  exs = new Instances(exs);
  exs.deleteWithMissingClass();

  int numegs = exs.numInstances();
  m_Dimension = exs.attribute(1).relation().numAttributes();
  m_Attribute = exs.attribute(1).relation().stringFreeStructure();
  Instances pos = new Instances(exs, 0), neg = new Instances(exs, 0);

  // Divide into two groups
  for (int u = 0; u < numegs; u++) {
    Instance example = exs.instance(u);
    if (example.classValue() == 1)
      pos.add(example);
    else
      neg.add(example);
  }
  int pnum = pos.numInstances(), nnum = neg.numInstances();

  // xBar, n
  m_MeanP = new double[pnum][m_Dimension];
  m_SumP = new double[pnum][m_Dimension];
  m_MeanN = new double[nnum][m_Dimension];
  m_SumN = new double[nnum][m_Dimension];
  // w, m
  m_ParamsP = new double[2 * m_Dimension];
  m_ParamsN = new double[2 * m_Dimension];
  // \sigma^2
  m_SgmSqP = new double[m_Dimension];
  m_SgmSqN = new double[m_Dimension];
  // S^2
  double[][] varP = new double[pnum][m_Dimension], varN = new double[nnum][m_Dimension];
  // numOfEx 'e' without all missing
  double[] effNumExP = new double[m_Dimension], effNumExN = new double[m_Dimension];
  // For the starting values
  double[] pMM = new double[m_Dimension],
      nMM = new double[m_Dimension],
      pVM = new double[m_Dimension],
      nVM = new double[m_Dimension];
  // # of exemplars with only one instance
  double[] numOneInsExsP = new double[m_Dimension], numOneInsExsN = new double[m_Dimension];
  // sum_i(1/n_i)
  double[] pInvN = new double[m_Dimension], nInvN = new double[m_Dimension];

  // Extract metadata from both positive and negative bags
  for (int v = 0; v < pnum; v++) {
    // Instance px = pos.instance(v);
    Instances pxi = pos.instance(v).relationalValue(1);
    for (int k = 0; k < pxi.numAttributes(); k++) {
      m_MeanP[v][k] = pxi.meanOrMode(k);
      varP[v][k] = pxi.variance(k);
    }

    for (int w = 0, t = 0; w < m_Dimension; w++, t++) {
      // if((t==m_ClassIndex) || (t==m_IdIndex))
      //   t++;
      if (varP[v][w] <= 0.0)
        varP[v][w] = 0.0;
      if (!Double.isNaN(m_MeanP[v][w])) {
        for (int u = 0; u < pxi.numInstances(); u++)
          if (!pxi.instance(u).isMissing(t))
            m_SumP[v][w] += pxi.instance(u).weight();

        pMM[w] += m_MeanP[v][w];
        pVM[w] += m_MeanP[v][w] * m_MeanP[v][w];
        if ((m_SumP[v][w] > 1) && (varP[v][w] > ZERO)) {
          m_SgmSqP[w] += varP[v][w] * (m_SumP[v][w] - 1.0) / m_SumP[v][w];
          // m_SgmSqP[w] += varP[v][w]*(m_SumP[v][w]-1.0);
          effNumExP[w]++; // Don't count exemplars with 1 instance
          pInvN[w] += 1.0 / m_SumP[v][w];
          // pInvN[w] += m_SumP[v][w];
        } else
          numOneInsExsP[w]++;
      }
    }
  }

  for (int v = 0; v < nnum; v++) {
    // Instance nx = neg.instance(v);
    Instances nxi = neg.instance(v).relationalValue(1);
    for (int k = 0; k < nxi.numAttributes(); k++) {
      m_MeanN[v][k] = nxi.meanOrMode(k);
      varN[v][k] = nxi.variance(k);
    }
    // Instances nxi = nx.getInstances();

    for (int w = 0, t = 0; w < m_Dimension; w++, t++) {
      // if((t==m_ClassIndex) || (t==m_IdIndex))
      //   t++;
      if (varN[v][w] <= 0.0)
        varN[v][w] = 0.0;
      if (!Double.isNaN(m_MeanN[v][w])) {
        for (int u = 0; u < nxi.numInstances(); u++)
          if (!nxi.instance(u).isMissing(t))
            m_SumN[v][w] += nxi.instance(u).weight();

        nMM[w] += m_MeanN[v][w];
        nVM[w] += m_MeanN[v][w] * m_MeanN[v][w];
        if ((m_SumN[v][w] > 1) && (varN[v][w] > ZERO)) {
          m_SgmSqN[w] += varN[v][w] * (m_SumN[v][w] - 1.0) / m_SumN[v][w];
          // m_SgmSqN[w] += varN[v][w]*(m_SumN[v][w]-1.0);
          effNumExN[w]++; // Don't count exemplars with 1 instance
          nInvN[w] += 1.0 / m_SumN[v][w];
          // nInvN[w] += m_SumN[v][w];
        } else
          numOneInsExsN[w]++;
      }
    }
  }

  // Expected \sigma^2
  /* if m_SgmSqP[u] or m_SgmSqN[u] is 0, assign 0 to sigma^2.
   * Otherwise, may cause m_SgmSqP / m_SgmSqN to be NaN.
   * Modified by Lin Dong (Sep. 2005)
   */
  for (int u = 0; u < m_Dimension; u++) {
    // For exemplars with only one instance, use avg(\sigma^2) of other exemplars
    if (m_SgmSqP[u] != 0)
      m_SgmSqP[u] /= (effNumExP[u] - pInvN[u]);
    else
      m_SgmSqP[u] = 0;
    if (m_SgmSqN[u] != 0)
      m_SgmSqN[u] /= (effNumExN[u] - nInvN[u]);
    else
      m_SgmSqN[u] = 0;
    // m_SgmSqP[u] /= (pInvN[u]-effNumExP[u]);
    // m_SgmSqN[u] /= (nInvN[u]-effNumExN[u]);

    effNumExP[u] += numOneInsExsP[u];
    effNumExN[u] += numOneInsExsN[u];
    pMM[u] /= effNumExP[u];
    nMM[u] /= effNumExN[u];
    pVM[u] = pVM[u] / (effNumExP[u] - 1.0)
        - pMM[u] * pMM[u] * effNumExP[u] / (effNumExP[u] - 1.0);
    nVM[u] = nVM[u] / (effNumExN[u] - 1.0)
        - nMM[u] * nMM[u] * effNumExN[u] / (effNumExN[u] - 1.0);
  }

  // Bounds and parameter values for each run
  double[][] bounds = new double[2][2];
  double[] pThisParam = new double[2], nThisParam = new double[2];

  // Initial values for parameters
  double w, m;
  Random whichEx = new Random(m_Seed);

  // Optimize for one dimension
  for (int x = 0; x < m_Dimension; x++) {
    // System.out.println("\n\n!!!!!!!!!!!!!!!!!!!!!!???Dimension #"+x);

    // Positive exemplars: first run
    pThisParam[0] = pVM[x]; // w
    if (pThisParam[0] <= ZERO)
      pThisParam[0] = 1.0;
    pThisParam[1] = pMM[x]; // m

    // Negative exemplars: first run
    nThisParam[0] = nVM[x]; // w
    if (nThisParam[0] <= ZERO)
      nThisParam[0] = 1.0;
    nThisParam[1] = nMM[x]; // m

    // Bound constraints
    bounds[0][0] = ZERO; // w > 0
    bounds[0][1] = Double.NaN;
    bounds[1][0] = Double.NaN;
    bounds[1][1] = Double.NaN;

    double pminVal = Double.MAX_VALUE, nminVal = Double.MAX_VALUE;
    TLDSimple_Optm pOp = null, nOp = null;
    boolean isRunValid = true;
    double[] sumP = new double[pnum], meanP = new double[pnum];
    double[] sumN = new double[nnum], meanN = new double[nnum];

    // One dimension
    for (int p = 0; p < pnum; p++) {
      sumP[p] = m_SumP[p][x];
      meanP[p] = m_MeanP[p][x];
    }
    for (int q = 0; q < nnum; q++) {
      sumN[q] = m_SumN[q][x];
      meanN[q] = m_MeanN[q][x];
    }

    for (int y = 0; y < m_Run; y++) {
      // System.out.println("\n\n!!!!!!!!!Positive exemplars: Run #"+y);
      double thisMin;
      pOp = new TLDSimple_Optm();
      pOp.setNum(sumP);
      pOp.setSgmSq(m_SgmSqP[x]);
      if (getDebug())
        System.out.println("m_SgmSqP[" + x + "]= " + m_SgmSqP[x]);
      pOp.setXBar(meanP);
      // pOp.setDebug(true);
      pThisParam = pOp.findArgmin(pThisParam, bounds);
      while (pThisParam == null) {
        pThisParam = pOp.getVarbValues();
        if (getDebug())
          System.out.println("!!! 200 iterations finished, not enough!");
        pThisParam = pOp.findArgmin(pThisParam, bounds);
      }

      thisMin = pOp.getMinFunction();
      if (!Double.isNaN(thisMin) && (thisMin < pminVal)) {
        pminVal = thisMin;
        for (int z = 0; z < 2; z++)
          m_ParamsP[2 * x + z] = pThisParam[z];
      }
      if (Double.isNaN(thisMin)) {
        pThisParam = new double[2];
        isRunValid = false;
      }
      if (!isRunValid) {
        y--;
        isRunValid = true;
      }

      // Change the initial parameters and restart
      int pone = whichEx.nextInt(pnum); // Randomly pick one pos. exemplar

      // Positive exemplars: next run
      while (Double.isNaN(m_MeanP[pone][x]))
        pone = whichEx.nextInt(pnum);
      m = m_MeanP[pone][x];
      w = (m - pThisParam[1]) * (m - pThisParam[1]);
      pThisParam[0] = w; // w
      pThisParam[1] = m; // m
    }

    for (int y = 0; y < m_Run; y++) {
      // System.out.println("\n\n!!!!!!!!!Negative exemplars: Run #"+y);
      double thisMin;
      nOp = new TLDSimple_Optm();
      nOp.setNum(sumN);
      nOp.setSgmSq(m_SgmSqN[x]);
      if (getDebug())
        System.out.println(m_SgmSqN[x]);
      nOp.setXBar(meanN);
      // nOp.setDebug(true);
      nThisParam = nOp.findArgmin(nThisParam, bounds);
      while (nThisParam == null) {
        nThisParam = nOp.getVarbValues();
        if (getDebug())
          System.out.println("!!! 200 iterations finished, not enough!");
        nThisParam = nOp.findArgmin(nThisParam, bounds);
      }

      thisMin = nOp.getMinFunction();
      if (!Double.isNaN(thisMin) && (thisMin < nminVal)) {
        nminVal = thisMin;
        for (int z = 0; z < 2; z++)
          m_ParamsN[2 * x + z] = nThisParam[z];
      }
      if (Double.isNaN(thisMin)) {
        nThisParam = new double[2];
        isRunValid = false;
      }
      if (!isRunValid) {
        y--;
        isRunValid = true;
      }

      // Change the initial parameters and restart
      int none = whichEx.nextInt(nnum); // Randomly pick one neg. exemplar

      // Negative exemplars: next run
      while (Double.isNaN(m_MeanN[none][x]))
        none = whichEx.nextInt(nnum);
      m = m_MeanN[none][x];
      w = (m - nThisParam[1]) * (m - nThisParam[1]);
      nThisParam[0] = w; // w
      nThisParam[1] = m; // m
    }
  }

  m_LkRatio = new double[m_Dimension];

  if (m_UseEmpiricalCutOff) {
    // Find the empirical cut-off
    double[] pLogOdds = new double[pnum], nLogOdds = new double[nnum];
    for (int p = 0; p < pnum; p++)
      pLogOdds[p] = likelihoodRatio(m_SumP[p], m_MeanP[p]);
    for (int q = 0; q < nnum; q++)
      nLogOdds[q] = likelihoodRatio(m_SumN[q], m_MeanN[q]);

    // Update m_Cutoff
    findCutOff(pLogOdds, nLogOdds);
  } else
    m_Cutoff = -Math.log((double) pnum / (double) nnum);

  /* for(int x=0, y=0; x<m_Dimension; x++, y++){
       if((x==exs.classIndex()) || (x==exs.idIndex()))
         y++;
       w=m_ParamsP[2*x]; m=m_ParamsP[2*x+1];
       System.err.println("\n\n???Positive: ( "+exs.attribute(y)+
           "): w="+w+", m="+m+", sgmSq="+m_SgmSqP[x]);
       w=m_ParamsN[2*x]; m=m_ParamsN[2*x+1];
       System.err.println("???Negative: ("+exs.attribute(y)+
           "): w="+w+", m="+m+", sgmSq="+m_SgmSqN[x]+
           "\nAvg. log-likelihood ratio in training data="
           +(m_LkRatio[x]/(pnum+nnum)));
     } */

  if (getDebug())
    System.err.println("\n\n???Cut-off=" + m_Cutoff);
}
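// Numeric sketch of the pooled within-bag variance computed above. Each bag
// contributes S_i^2 * (n_i-1)/n_i, and the divisor effNumEx - sum(1/n_i) equals
// sum((n_i-1)/n_i), so the estimate is a weighted mean of the bag variances.
// Two bags: S1^2 = 2.0 with n1 = 4 instances, S2^2 = 1.0 with n2 = 2.
double num = 2.0 * (4 - 1.0) / 4 + 1.0 * (2 - 1.0) / 2; // 1.5 + 0.5 = 2.0
double den = (1 - 1.0 / 4) + (1 - 1.0 / 2);             // 0.75 + 0.50 = 1.25
double sgmSq = num / den;                               // 1.6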