// This takes an inordinate amount of time. -jdramsey 20150929
private int[] getNonMissingRows(Node x, Node y, List<Node> z) {
  //  List<Integer> rows = new ArrayList<Integer>();
  //
  //  I:
  //  for (int i = 0; i < internalData.getNumRows(); i++) {
  //    for (Node node : variablesPerNode.get(x)) {
  //      if (isMissing(node, i)) continue I;
  //    }
  //
  //    for (Node node : variablesPerNode.get(y)) {
  //      if (isMissing(node, i)) continue I;
  //    }
  //
  //    for (Node _z : z) {
  //      for (Node node : variablesPerNode.get(_z)) {
  //        if (isMissing(node, i)) continue I;
  //      }
  //    }
  //
  //    rows.add(i);
  //  }
  //
  //  int[] _rows = new int[rows.size()];
  //  for (int k = 0; k < rows.size(); k++) _rows[k] = rows.get(k);

  // The row-by-row scan above was disabled for speed; the _rows field is lazily initialized
  // here to "use all rows" instead.
  if (_rows == null) {
    _rows = new int[internalData.getNumRows()];
    for (int k = 0; k < _rows.length; k++) _rows[k] = k;
  }

  return _rows;
}
/**
 * Constructs a test using a given data set. If a data set is provided (that is, a tabular data
 * set), fourth moment statistics can be calculated (p. 160); otherwise, it must be assumed that
 * the data are multivariate Gaussian.
 */
public DeltaSextadTest(DataSet dataSet) {
  if (dataSet == null) {
    throw new NullPointerException();
  }

  if (!dataSet.isContinuous()) {
    throw new IllegalArgumentException();
  }

  this.cov = new CovarianceMatrix(dataSet);

  List<DataSet> data1 = new ArrayList<DataSet>();
  data1.add(dataSet);
  List<DataSet> data2 = DataUtils.center(data1);
  this.dataSet = data2.get(0);

  this.data = this.dataSet.getDoubleData().transpose().toArray();
  this.N = dataSet.getNumRows();
  this.variables = dataSet.getVariables();
  this.numVars = dataSet.getNumColumns();

  this.variablesHash = new HashMap<Node, Integer>();

  for (int i = 0; i < variables.size(); i++) {
    variablesHash.put(variables.get(i), i);
  }

  this.means = new double[numVars];

  for (int i = 0; i < numVars; i++) {
    means[i] = mean(data[i], N);
  }
}
/**
 * @param dataSet A discrete data set.
 * @param column the column in question.
 * @return the maximum value in that column.
 */
private int maxInColumn(DataSet dataSet, int column) {
  int max = -1;

  for (int i = 0; i < dataSet.getNumRows(); i++) {
    int value = dataSet.getInt(i, column);
    if (value > max) max = value;
  }

  return max;
}
/**
 * Splits the given data set into the subsets specified by the parameters, shuffling the cases
 * first if requested.
 *
 * @return a DataModelList containing one data set per split, named with the splitNames selected
 *     by the editor.
 */
public static DataModel createSplits(DataSet dataSet, SplitCasesParams params) {
  List<Integer> indices = new ArrayList<Integer>(dataSet.getNumRows());

  for (int i = 0; i < dataSet.getNumRows(); i++) {
    indices.add(i);
  }

  if (params.isDataShuffled()) {
    Collections.shuffle(indices);
  }

  SplitCasesSpec spec = params.getSpec();
  int numSplits = params.getNumSplits();
  int sampleSize = spec.getSampleSize();
  int[] breakpoints = spec.getBreakpoints();
  List<String> splitNames = spec.getSplitNames();

  // Pad the breakpoints with 0 at the front and the sample size at the back.
  int[] _breakpoints = new int[breakpoints.length + 2];
  _breakpoints[0] = 0;
  _breakpoints[_breakpoints.length - 1] = sampleSize;
  System.arraycopy(breakpoints, 0, _breakpoints, 1, breakpoints.length);

  DataModelList list = new DataModelList();
  int ncols = dataSet.getNumColumns();

  for (int n = 0; n < numSplits; n++) {
    int _sampleSize = _breakpoints[n + 1] - _breakpoints[n];

    DataSet _data = new ColtDataSet(_sampleSize, dataSet.getVariables());
    _data.setName(splitNames.get(n));

    for (int i = 0; i < _sampleSize; i++) {
      int oldCase = indices.get(i + _breakpoints[n]);

      for (int j = 0; j < ncols; j++) {
        _data.setObject(i, j, dataSet.getObject(oldCase, j));
      }
    }

    list.add(_data);
  }

  return list;
}
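The same partitioning logic, sketched with only java.util for illustration (the class and method names below are hypothetical and not part of the Tetrad API): shuffle the row indices if desired, then carve them into contiguous blocks at the breakpoints.

import java.util.ArrayList;
import java.util.Collections;
import java.util.List;

public final class SplitDemo {

  /** Shuffles 0..n-1 (optionally) and partitions the indices at the given breakpoints. */
  static List<List<Integer>> split(int n, int[] breakpoints, boolean shuffle) {
    List<Integer> indices = new ArrayList<>();
    for (int i = 0; i < n; i++) {
      indices.add(i);
    }
    if (shuffle) {
      Collections.shuffle(indices);
    }

    // Pad the breakpoints with 0 at the front and n at the back, as createSplits does.
    int[] bounds = new int[breakpoints.length + 2];
    bounds[0] = 0;
    bounds[bounds.length - 1] = n;
    System.arraycopy(breakpoints, 0, bounds, 1, breakpoints.length);

    List<List<Integer>> splits = new ArrayList<>();
    for (int s = 0; s < bounds.length - 1; s++) {
      splits.add(indices.subList(bounds[s], bounds[s + 1]));
    }
    return splits;
  }

  public static void main(String[] args) {
    // Ten cases split at rows 3 and 7 into blocks of sizes 3, 4, and 3.
    System.out.println(split(10, new int[] {3, 7}, false)); // [[0, 1, 2], [3, 4, 5, 6], [7, 8, 9]]
  }
}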
private void setDataSet(DataSet dataSet) {
  List<String> _varNames = dataSet.getVariableNames();

  this.variables = dataSet.getVariables();
  this.dataSet = dataSet;
  this.discrete = dataSet.isDiscrete();

  if (!isDiscrete()) {
    this.covariances = new CovarianceMatrix(dataSet);
  }

  this.sampleSize = dataSet.getNumRows();
}
/** Creates a cell count table for the given data set. */
public DataSetProbs(DataSet dataSet) {
  if (dataSet == null) {
    throw new NullPointerException();
  }

  this.dataSet = dataSet;

  dims = new int[dataSet.getNumColumns()];

  for (int i = 0; i < dims.length; i++) {
    DiscreteVariable variable = (DiscreteVariable) dataSet.getVariable(i);
    dims[i] = variable.getNumCategories();
  }

  numRows = dataSet.getNumRows();
}
private List<Node> expandVariable(DataSet dataSet, Node node) {
  if (node instanceof ContinuousVariable) {
    return Collections.singletonList(node);
  }

  if (node instanceof DiscreteVariable && ((DiscreteVariable) node).getNumCategories() < 3) {
    return Collections.singletonList(node);
  }

  if (!(node instanceof DiscreteVariable)) {
    throw new IllegalArgumentException();
  }

  List<String> varCats = new ArrayList<String>(((DiscreteVariable) node).getCategories());

  // The first category is the reference category and gets no indicator column.
  varCats.remove(0);

  List<Node> variables = new ArrayList<Node>();

  for (String cat : varCats) {
    // Choose a name that does not collide with an existing column, appending a counter if the
    // base name is already taken.
    String baseName = node.getName() + "MULTINOM" + "." + cat;
    String newVarName = baseName;
    int suffix = 1;

    while (dataSet.getVariable(newVarName) != null) {
      newVarName = baseName + "." + suffix++;
    }

    Node newVar = new DiscreteVariable(newVarName, 2);
    variables.add(newVar);
    dataSet.addVariable(newVar);

    int newVarIndex = dataSet.getColumn(newVar);
    int numCases = dataSet.getNumRows();

    // Indicator coding: 1 if this case takes category cat, 0 otherwise.
    for (int l = 0; l < numCases; l++) {
      Object dataCell = dataSet.getObject(l, dataSet.getColumn(node));
      int dataCellIndex = ((DiscreteVariable) node).getIndex(dataCell.toString());

      if (dataCellIndex == ((DiscreteVariable) node).getIndex(cat)) {
        dataSet.setInt(l, newVarIndex, 1);
      } else {
        dataSet.setInt(l, newVarIndex, 0);
      }
    }
  }

  return variables;
}
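For reference, a minimal sketch of the reference-category indicator coding performed above, using plain arrays rather than the DataSet API (illustrative names only):

public final class DummyCodingDemo {

  /**
   * Expands a column of category indices 0..(k-1) into k-1 indicator columns, treating
   * category 0 as the reference category (it gets no column).
   */
  static int[][] expand(int[] column, int numCategories) {
    int[][] indicators = new int[numCategories - 1][column.length];
    for (int cat = 1; cat < numCategories; cat++) {
      for (int row = 0; row < column.length; row++) {
        indicators[cat - 1][row] = (column[row] == cat) ? 1 : 0;
      }
    }
    return indicators;
  }

  public static void main(String[] args) {
    int[] column = {0, 2, 1, 2, 0};
    int[][] indicators = expand(column, 3);
    // Category 1 indicator: [0, 0, 1, 0, 0]; category 2 indicator: [0, 1, 0, 1, 0].
    for (int[] ind : indicators) {
      System.out.println(java.util.Arrays.toString(ind));
    }
  }
}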
private int sampleSize() {
  return dataSet.getNumRows();
}
private double[] dependencePvalsLogit(Node x, Node y, List<Node> z) {
  if (!variablesPerNode.containsKey(x)) {
    throw new IllegalArgumentException("Unrecognized node: " + x);
  }

  if (!variablesPerNode.containsKey(y)) {
    throw new IllegalArgumentException("Unrecognized node: " + y);
  }

  for (Node node : z) {
    if (!variablesPerNode.containsKey(node)) {
      throw new IllegalArgumentException("Unrecognized node: " + node);
    }
  }

  List<Double> pValues = new ArrayList<Double>();

  int[] _rows = getNonMissingRows(x, y, z);
  logisticRegression.setRows(_rows);

  List<Node> yzDumList = new ArrayList<>();
  List<Node> yzList = new ArrayList<>();
  yzList.add(y);
  yzList.addAll(z);

  yzDumList.addAll(variablesPerNode.get(y));

  for (Node _z : z) {
    yzDumList.addAll(variablesPerNode.get(_z));
  }

  // Sum of log p-values, one slot per conditioning variable (y first, then each z).
  double[] sumLnP = new double[yzList.size()];
  for (int i = 0; i < sumLnP.length; i++) sumLnP[i] = 0.0;

  for (int i = 0; i < variablesPerNode.get(x).size(); i++) {
    Node _x = variablesPerNode.get(x).get(i);

    LogisticRegression.Result result1 =
        logisticRegression.regress((DiscreteVariable) _x, yzDumList);

    int n = originalData.getNumRows();
    int k = yzDumList.size();

    // Skip the intercept at index 0.
    int coefIndex = 1;

    for (int j = 0; j < yzList.size(); j++) {
      for (int dum = 0; dum < variablesPerNode.get(yzList.get(j)).size(); dum++) {
        double wald = Math.abs(result1.getCoefs()[coefIndex] / result1.getStdErrs()[coefIndex]);

        // Two-tailed t test on the Wald statistic; this is exactly the same test as the
        // linear case.
        double val = (1.0 - ProbUtils.tCdf(wald, n - k)) * 2;

        sumLnP[j] += Math.log(val);
        coefIndex++;
      }
    }
  }

  // Combine the per-dummy p-values for each conditioning variable via a chi-square on
  // -2 * sum(ln p).
  double[] pVec = new double[sumLnP.length];

  for (int i = 0; i < pVec.length; i++) {
    if (sumLnP[i] == Double.NEGATIVE_INFINITY) {
      pVec[i] = 0.0;
    } else {
      int df = 2 * variablesPerNode.get(x).size() * variablesPerNode.get(yzList.get(i)).size();
      pVec[i] = 1.0 - new ChiSquaredDistribution(df).cumulativeProbability(-2 * sumLnP[i]);
    }
  }

  return pVec;
}
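The combination step above is Fisher's method. A minimal, self-contained sketch of that step, assuming Apache Commons Math (already used above) for the chi-square CDF; the class and method names are illustrative:

import org.apache.commons.math3.distribution.ChiSquaredDistribution;

public final class FisherCombination {

  /**
   * Combines independent p-values with Fisher's method: under the null, -2 * sum(ln p) is
   * chi-square distributed with 2 * (number of p-values) degrees of freedom.
   */
  public static double combine(double[] pValues) {
    double sumLnP = 0.0;
    for (double p : pValues) {
      sumLnP += Math.log(p);
    }
    if (sumLnP == Double.NEGATIVE_INFINITY) {
      return 0.0; // Some p-value was exactly zero, so the combined p-value is zero as well.
    }
    int df = 2 * pValues.length;
    return 1.0 - new ChiSquaredDistribution(df).cumulativeProbability(-2.0 * sumLnP);
  }

  public static void main(String[] args) {
    double[] pValues = {0.03, 0.20, 0.45};
    System.out.println("Combined p-value: " + combine(pValues));
  }
}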
private void createDiscreteTimeSeriesData() {
  // GIVEN: Discrete data set D, maximum lag m.
  Node[] dataVars = dataSet.getVariables().toArray(new Node[0]);
  int n = dataVars.length;
  int m = getNumLags();

  // Let Xi, i = 0,...,n-1, be the variables from the data. Let Xi(t) be the variable Xi at
  // time lag t (before 0), t = 0,...,m.
  Node[][] laggedVars = new Node[m + 1][n];
  Knowledge knowledge = new Knowledge();

  for (int s = 0; s <= m; s++) {
    for (int j = 0; j < n; j++) {
      String name1 = dataVars[j].getName();
      String name2 = name1 + "." + (s + 1);
      laggedVars[s][j] = new DiscreteVariable((DiscreteVariable) dataVars[j]);
      laggedVars[s][j].setName(name2);
      laggedVars[s][j].setCenter(80 * j + 50, 80 * (m - s) + 50);
      knowledge.addToTier(s, laggedVars[s][j].getName());
    }
  }

  // 2. Collect the lagged variables, in the same order used for the columns below.
  List<Node> variables = new LinkedList<Node>();

  for (int s = 0; s <= m; s++) {
    for (int i = 0; i < n; i++) {
      variables.add(laggedVars[s][i]);
    }
  }

  // A data set with N rows and maximum lag m yields N - m complete lagged cases.
  DataSet _laggedData = new ColtDataSet(dataSet.getNumRows() - m, variables);

  for (int s = 0; s <= m; s++) {
    for (int i = 0; i < n; i++) {
      int[] rawData = new int[dataSet.getNumRows()];

      for (int j = 0; j < dataSet.getNumRows(); j++) {
        rawData[j] = dataSet.getInt(j, i);
      }

      int size = dataSet.getNumRows();
      int[] laggedRaw = new int[size - m];
      System.arraycopy(rawData, m - s, laggedRaw, 0, size - m);

      int _col = _laggedData.getColumn(laggedVars[s][i]);

      for (int j = 0; j < laggedRaw.length; j++) {
        _laggedData.setInt(j, _col, laggedRaw[j]);
      }
    }
  }

  knowledge.setDefaultToKnowledgeLayout(true);
  _laggedData.setKnowledge(knowledge);

  DataModelList list = new DataModelList();
  list.add(_laggedData);
  getDataEditor().reset(list);
  getDataEditor().selectLastTab();
}
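A standalone sketch of the same lagging scheme on a plain int array (hypothetical names, not part of this class): row t of the result holds the series value at time t + m - s in the column for lag s, so the lag-0 column carries the most recent value in each row.

public final class LagDemo {

  /** Returns a matrix with series.length - m rows and m + 1 columns; column s is the series at lag s. */
  static int[][] lag(int[] series, int m) {
    int rows = series.length - m;
    int[][] lagged = new int[rows][m + 1];
    for (int s = 0; s <= m; s++) {
      for (int t = 0; t < rows; t++) {
        lagged[t][s] = series[t + m - s]; // lag 0 is the most recent value in each row
      }
    }
    return lagged;
  }

  public static void main(String[] args) {
    int[] series = {1, 2, 3, 4, 5};
    for (int[] row : lag(series, 2)) {
      System.out.println(java.util.Arrays.toString(row)); // [3, 2, 1], [4, 3, 2], [5, 4, 3]
    }
  }
}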
protected SemIm estimateCoeffs(SemIm semIm) {
  SemGraph semGraph = semIm.getSemPm().getGraph();

  // Get the list of fixed measurements that will be kept fixed, and the respective latent
  // variables that are their parents. "X" variables are exogenous, while "Y" variables are
  // endogenous.
  List<Node> ly = new LinkedList<Node>();
  List<Node> lx = new LinkedList<Node>();
  List<Node> my1 = new LinkedList<Node>();
  List<Node> mx1 = new LinkedList<Node>();
  List<Node> observed = new LinkedList<Node>();

  for (Node nodeA : semGraph.getNodes()) {
    if (nodeA.getNodeType() == NodeType.ERROR) {
      continue;
    }

    if (nodeA.getNodeType() == NodeType.LATENT) {
      if (semGraph.getParents(nodeA).size() == 0) {
        lx.add(nodeA);
      } else {
        ly.add(nodeA);
      }
    } else {
      observed.add(nodeA);
    }
  }

  setFixedNodes(semGraph, mx1, my1);

  // ------------------------------------------------------------------
  // Estimate freeParameters for the latent/latent edges.
  for (Node current : ly) {
    if (nodeName != null && !nodeName.equals(current.getName())) {
      continue;
    }

    // Build Z, the matrix containing the data for the fixed measurements associated with the
    // parents of the current (endogenous) latent node.
    List<Node> endo_parents_m = new LinkedList<Node>();
    List<Node> exo_parents_m = new LinkedList<Node>();
    List<Node> endo_parents = new LinkedList<Node>();
    List<Node> exo_parents = new LinkedList<Node>();
    Iterator<Node> it_p = semGraph.getParents(current).iterator();

    lNames = new String[lx.size() + ly.size()];

    while (it_p.hasNext()) {
      Node node = it_p.next();

      if (node.getNodeType() == NodeType.ERROR) {
        continue;
      }

      if (lx.contains(node)) {
        int position = lx.indexOf(node);
        exo_parents_m.add(mx1.get(position));
        exo_parents.add(node);
      } else {
        int position = ly.indexOf(node);
        endo_parents_m.add(my1.get(position));
        endo_parents.add(node);
      }
    }

    Object endp_a_m[] = endo_parents_m.toArray();
    Object exop_a_m[] = exo_parents_m.toArray();
    Object endp_a[] = endo_parents.toArray();
    Object exop_a[] = exo_parents.toArray();

    int n = dataSet.getNumRows(), c = endp_a_m.length + exop_a_m.length;

    if (c == 0) {
      continue;
    }

    double Z[][] = new double[n][c];
    int count = 0;

    for (int i = 0; i < endp_a_m.length; i++) {
      Node node = (Node) endp_a_m[i];
      String name = node.getName();
      Node variable = dataSet.getVariable(name);
      int colIndex = dataSet.getVariables().indexOf(variable);

      for (int j = 0; j < n; j++) {
        Z[j][i] = dataSet.getDouble(j, colIndex);
      }

      lNames[count++] = (endo_parents.get(i)).getName();
    }

    for (int i = 0; i < exop_a_m.length; i++) {
      Node node = (Node) exop_a_m[i];
      String name = node.getName();
      Node variable = dataSet.getVariable(name);
      int colIndex = dataSet.getVariables().indexOf(variable);

      for (int j = 0; j < n; j++) {
        Z[j][endp_a_m.length + i] = dataSet.getDouble(j, colIndex);
      }

      lNames[count++] = exo_parents.get(i).getName();
    }

    // Build V, the matrix containing the data for the nonfixed measurements associated with the
    // parents of the current (endogenous) latent node.
    endo_parents_m = new LinkedList<Node>();
    exo_parents_m = new LinkedList<Node>();
    it_p = semGraph.getParents(current).iterator();

    while (it_p.hasNext()) {
      Node node = it_p.next();

      if (node.getNodeType() == NodeType.ERROR) {
        continue;
      }

      List<Node> other_measures = new LinkedList<Node>();

      for (Node next : semGraph.getChildren(node)) {
        if (next.getNodeType() == NodeType.MEASURED) {
          other_measures.add(next);
        }
      }

      if (lx.contains(node)) {
        int position = lx.indexOf(node);
        other_measures.remove(mx1.get(position));
        exo_parents_m.addAll(other_measures);
      } else {
        int position = ly.indexOf(node);
        other_measures.remove(my1.get(position));
        endo_parents_m.addAll(other_measures);
      }
    }

    endp_a_m = endo_parents_m.toArray();
    exop_a_m = exo_parents_m.toArray();

    n = dataSet.getNumRows();
    c = endp_a_m.length + exop_a_m.length;

    double V[][] = new double[n][c];

    if (c == 0) {
      continue;
    }

    for (int i = 0; i < endp_a_m.length; i++) {
      Node node = ((Node) endp_a_m[i]);
      String name = node.getName();
      Node variable = dataSet.getVariable(name);
      int colIndex = dataSet.getVariables().indexOf(variable);

      for (int j = 0; j < n; j++) {
        V[j][i] = dataSet.getDouble(j, colIndex);
      }
    }

    for (int i = 0; i < exop_a_m.length; i++) {
      Node node = (Node) exop_a_m[i];
      String name = node.getName();
      Node variable = dataSet.getVariable(name);
      int colIndex = dataSet.getVariables().indexOf(variable);

      for (int j = 0; j < n; j++) {
        V[j][endp_a_m.length + i] = dataSet.getDouble(j, colIndex);
      }
    }

    double yi[] = new double[n];

    if (lx.contains(current)) {
      int position = lx.indexOf(current);
      Node node = mx1.get(position);
      String name = node.getName();
      Node variable = dataSet.getVariable(name);
      int colIndex = dataSet.getVariables().indexOf(variable);

      for (int i = 0; i < n; i++) {
        yi[i] = dataSet.getDouble(i, colIndex);
      }
    } else {
      int position = ly.indexOf(current);
      Node node = my1.get(position);
      String name = node.getName();
      Node variable = dataSet.getVariable(name);
      int colIndex = dataSet.getVariables().indexOf(variable);

      for (int i = 0; i < n; i++) {
        yi[i] = dataSet.getDouble(i, colIndex);
      }
    }

    // Build Z_hat, the projection of Z onto the column space of V.
    double Z_hat[][] =
        MatrixUtils.product(
            V,
            MatrixUtils.product(
                MatrixUtils.inverse(MatrixUtils.product(MatrixUtils.transpose(V), V)),
                MatrixUtils.product(MatrixUtils.transpose(V), Z)));

    A_hat =
        MatrixUtils.product(
            MatrixUtils.inverse(MatrixUtils.product(MatrixUtils.transpose(Z_hat), Z_hat)),
            MatrixUtils.product(MatrixUtils.transpose(Z_hat), yi));

    // Set the edge for the fixed measurement.
    int position = ly.indexOf(current);
    semIm.setParamValue(current, my1.get(position), 1.);

    // Set the edges for the latents.
    for (int i = 0; i < endp_a.length; i++) {
      semIm.setParamValue((Node) endp_a[i], current, A_hat[i]);
    }

    for (int i = 0; i < exop_a.length; i++) {
      semIm.setParamValue((Node) exop_a[i], current, A_hat[endp_a.length + i]);
    }

    if (nodeName != null && nodeName.equals(current.getName())) {
      computeAsymptLatentCovar(yi, A_hat, Z, Z_hat, dataSet.getNumRows());
      break;
    }
  }

  // ------------------------------------------------------------------
  // Estimate freeParameters of the measurement model.

  // Set the edges of the fixed measurements of exogenous latents.
  for (Node current : lx) {
    int position = lx.indexOf(current);
    semIm.setParamValue(current, mx1.get(position), 1.);
  }

  for (Node current : observed) {
    if (nodeName != null && !nodeName.equals(current.getName())) {
      continue;
    }

    if (mx1.contains(current) || my1.contains(current)) {
      continue;
    }

    // First, get the latent parent of this observed node.
    Node current_latent = null;

    for (Node node : semGraph.getParents(current)) {
      if (node.getNodeType() == NodeType.ERROR) {
        continue;
      }
      current_latent = node;
    }

    Iterator<Node> children = semGraph.getChildren(current_latent).iterator();
    List<Node> other_measures = new LinkedList<Node>();
    Node fixed_measurement;

    while (children.hasNext()) {
      Node next = children.next();

      if ((next.getNodeType() == NodeType.MEASURED) && next != current) {
        other_measures.add(next);
      }
    }

    if (lx.contains(current_latent)) {
      int position = lx.indexOf(current_latent);
      other_measures.remove(mx1.get(position));
      fixed_measurement = mx1.get(position);
    } else {
      int position = ly.indexOf(current_latent);
      other_measures.remove(my1.get(position));
      fixed_measurement = my1.get(position);
    }

    // Regress other_measures over the fixed measurement x1 (y1) corresponding to the
    // measurement variable that is being evaluated.
    int n = dataSet.getNumRows(), c = other_measures.size();

    if (c == 0) {
      continue;
    }

    double Z[][] = new double[n][c];

    for (int i = 0; i < c; i++) {
      Node variable = dataSet.getVariable((other_measures.get(i)).getName());
      int varIndex = dataSet.getVariables().indexOf(variable);

      for (int j = 0; j < n; j++) {
        Z[j][i] = dataSet.getDouble(j, varIndex);
      }
    }

    // Build C, the column vector containing the data for the fixed measurement associated with
    // the only latent parent of the current observed node (as assumed by the structure of our
    // measurement model).
    Node variable = dataSet.getVariable(fixed_measurement.getName());
    int colIndex = dataSet.getVariables().indexOf(variable);

    double[] C = new double[dataSet.getNumRows()];

    for (int i = 0; i < dataSet.getNumRows(); i++) {
      C[i] = dataSet.getDouble(i, colIndex);
    }

    // Build V, the matrix containing the data for the other measurements associated with the
    // parents of the (latent) parent of the current observed node. The only difference with
    // respect to the estimation of the within-latent coefficients is that here we only include
    // the other measurements attached to the parent of the current node, assuming that the
    // error term of the current node is independent of the error term of the others and that
    // each measurement is taken with respect to only one latent.
    n = dataSet.getNumRows();
    c = other_measures.size();

    double V[][] = new double[n][c];

    for (int i = 0; i < c; i++) {
      Node variable2 = dataSet.getVariable((other_measures.get(i)).getName());
      int var2index = dataSet.getVariables().indexOf(variable2);

      for (int j = 0; j < n; j++) {
        V[j][i] = dataSet.getDouble(j, var2index);
      }
    }

    double yi[] = new double[n];
    Node variable3 = dataSet.getVariable((current).getName());
    int var3Index = dataSet.getVariables().indexOf(variable3);

    for (int i = 0; i < n; i++) {
      yi[i] = dataSet.getDouble(i, var3Index);
    }

    // Build C_hat, the projection of C onto the column space of V.
    double C_hat[] =
        MatrixUtils.product(
            V,
            MatrixUtils.product(
                MatrixUtils.inverse(MatrixUtils.product(MatrixUtils.transpose(V), V)),
                MatrixUtils.product(MatrixUtils.transpose(V), C)));

    double A_hat =
        MatrixUtils.innerProduct(
            MatrixUtils.scalarProduct(1. / MatrixUtils.innerProduct(C_hat, C_hat), C_hat), yi);

    // Set the edge for the current measurement.
    semIm.setParamValue(current_latent, current, A_hat);
  }

  return semIm;
}
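In matrix notation, the nested MatrixUtils calls above compute a standard two-stage least squares estimate, with V holding the instrument columns, Z the regressor columns, and y the outcome column:

\[
\hat{Z} = V\,(V^{\top}V)^{-1}V^{\top}Z,
\qquad
\hat{A} = (\hat{Z}^{\top}\hat{Z})^{-1}\hat{Z}^{\top}y,
\]

and, for the measurement model, \( \hat{C} = V\,(V^{\top}V)^{-1}V^{\top}C \) with \( \hat{A} = \hat{C}^{\top}y \,/\, \hat{C}^{\top}\hat{C} \).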
/**
 * This method takes an instantiated Bayes net (BayesIm) whose graph includes all the variables
 * (observed and latent) and computes estimated counts using the data in the DataSet mixedData.
 * The counts that are estimated correspond to cells in the conditional probability tables of the
 * Bayes net. The outermost loop (indexed by j) is over the set of variables. If the variable has
 * no parents, each case in the dataset is examined and the count for the observed value of the
 * variable is increased by 1.0; if the value of the variable is missing, the marginal
 * probabilities of its values given the values of the variables that are available for that case
 * are used to increment the corresponding estimated counts. If a variable has parents, then there
 * is a loop which steps through all possible sets of values of its parents. This loop is indexed
 * by the variable "row". Each case in the dataset is examined. If the variable and all its
 * parents have values in the case, the corresponding estimated counts are incremented by 1.0. If
 * the variable or any of its parents have missing values, the joint marginal is computed for the
 * variable and the set of values of its parents corresponding to "row", and the corresponding
 * estimated counts are incremented by the appropriate probability. The estimated counts are
 * stored in the double[][][] array estimatedCounts. The count (possibly fractional) of the
 * number of times each combination of parent values occurs is stored in the double[][] array
 * estimatedCountsDenom. These two arrays are used to compute the estimated conditional
 * probabilities of the output Bayes net.
 */
private BayesIm expectation(BayesIm inputBayesIm) {
  int numCases = mixedData.getNumRows();
  int numVariables = allVariables.size();

  RowSummingExactUpdater rseu = new RowSummingExactUpdater(inputBayesIm);

  for (int j = 0; j < numVariables; j++) {
    DiscreteVariable var = (DiscreteVariable) allVariables.get(j);
    String varName = var.getName();
    Node varNode = graph.getNode(varName);
    int varIndex = inputBayesIm.getNodeIndex(varNode);
    int[] parentVarIndices = inputBayesIm.getParents(varIndex);

    // This segment is for variables with no parents:
    if (parentVarIndices.length == 0) {
      for (int col = 0; col < var.getNumCategories(); col++) {
        estimatedCounts[j][0][col] = 0.0;
      }

      for (int i = 0; i < numCases; i++) {
        // If this case has a value for var, count it directly.
        if (mixedData.getInt(i, j) != -99) {
          estimatedCounts[j][0][mixedData.getInt(i, j)] += 1.0;
        } else {
          // Otherwise find the marginal probability of each value of var, given the observed
          // data in this case.
          Evidence evidenceThisCase = Evidence.tautology(inputBayesIm);
          boolean existsEvidence = false;

          // Define evidence for updating by using the values of the other vars.
          for (int k = 0; k < numVariables; k++) {
            if (k == j) {
              continue;
            }

            Node otherVar = allVariables.get(k);

            if (mixedData.getInt(i, k) == -99) {
              continue;
            }

            existsEvidence = true;
            String otherVarName = otherVar.getName();
            Node otherNode = graph.getNode(otherVarName);
            int otherIndex = inputBayesIm.getNodeIndex(otherNode);

            evidenceThisCase.getProposition().setCategory(otherIndex, mixedData.getInt(i, k));
          }

          if (!existsEvidence) {
            continue; // No other variable contained useful data.
          }

          rseu.setEvidence(evidenceThisCase);

          for (int m = 0; m < var.getNumCategories(); m++) {
            estimatedCounts[j][0][m] += rseu.getMarginal(varIndex, m);
          }
        }
      }
    } else { // For variables with parents:
      int numRows = inputBayesIm.getNumRows(varIndex);

      for (int row = 0; row < numRows; row++) {
        int[] parValues = inputBayesIm.getParentValues(varIndex, row);
        estimatedCountsDenom[varIndex][row] = 0.0;

        for (int col = 0; col < var.getNumCategories(); col++) {
          estimatedCounts[varIndex][row][col] = 0.0;
        }

        for (int i = 0; i < numCases; i++) {
          // For a case where the parent values = parValues, increment the estimated count.
          boolean parentMatch = true;

          for (int p = 0; p < parentVarIndices.length; p++) {
            if (parValues[p] != mixedData.getInt(i, parentVarIndices[p])
                && mixedData.getInt(i, parentVarIndices[p]) != -99) {
              parentMatch = false;
              break;
            }
          }

          if (!parentMatch) {
            continue; // Not a matching case; go to the next.
          }

          boolean parentMissing = false;

          for (int parentVarIndice : parentVarIndices) {
            if (mixedData.getInt(i, parentVarIndice) == -99) {
              parentMissing = true;
              break;
            }
          }

          if (mixedData.getInt(i, j) != -99 && !parentMissing) {
            estimatedCounts[j][row][mixedData.getInt(i, j)] += 1.0;
            estimatedCountsDenom[j][row] += 1.0;
            continue; // Next case.
          }

          // For a case with missing data (either var or one of its parents), compute the joint
          // marginal distribution for var and this combination of values of its parents, and
          // update the estimated counts accordingly. To compute marginals, create the evidence.
          boolean existsEvidence = false;

          Evidence evidenceThisCase = Evidence.tautology(inputBayesIm);

          // "evidenceVars" not used.
          //  List<String> evidenceVars = new LinkedList<String>();
          //  for (int k = 0; k < numVariables; k++) {
          //    //if(k == j) continue;
          //    Variable otherVar = allVariables.get(k);
          //    if (mixedData.getInt(i, k) == -99) {
          //      continue;
          //    }
          //    existsEvidence = true;
          //    String otherVarName = otherVar.getName();
          //    Node otherNode = graph.getNode(otherVarName);
          //    int otherIndex = inputBayesIm.getNodeIndex(otherNode);
          //    evidenceThisCase.getProposition().setCategory(otherIndex, mixedData.getInt(i, k));
          //    evidenceVars.add(otherVarName);
          //  }

          if (!existsEvidence) {
            continue;
          }

          rseu.setEvidence(evidenceThisCase);

          estimatedCountsDenom[j][row] += rseu.getJointMarginal(parentVarIndices, parValues);

          int[] parPlusChildIndices = new int[parentVarIndices.length + 1];
          int[] parPlusChildValues = new int[parentVarIndices.length + 1];

          parPlusChildIndices[0] = varIndex;

          for (int pc = 1; pc < parPlusChildIndices.length; pc++) {
            parPlusChildIndices[pc] = parentVarIndices[pc - 1];
            parPlusChildValues[pc] = parValues[pc - 1];
          }

          for (int m = 0; m < var.getNumCategories(); m++) {
            parPlusChildValues[0] = m;
            estimatedCounts[j][row][m] +=
                rseu.getJointMarginal(parPlusChildIndices, parPlusChildValues);
          }
        }
      }
    } // else
  } // j < numVariables

  BayesIm outputBayesIm = new MlBayesIm(bayesPm);

  for (int j = 0; j < nodes.length; j++) {
    DiscreteVariable var = (DiscreteVariable) allVariables.get(j);
    String varName = var.getName();
    Node varNode = graph.getNode(varName);
    int varIndex = inputBayesIm.getNodeIndex(varNode);

    int numRows = inputBayesIm.getNumRows(j);
    int numCols = inputBayesIm.getNumColumns(j);

    if (numRows == 1) {
      double sum = 0.0;

      for (int m = 0; m < numCols; m++) {
        sum += estimatedCounts[j][0][m];
      }

      for (int m = 0; m < numCols; m++) {
        condProbs[j][0][m] = estimatedCounts[j][0][m] / sum;
        outputBayesIm.setProbability(varIndex, 0, m, condProbs[j][0][m]);
      }
    } else {
      for (int row = 0; row < numRows; row++) {
        for (int m = 0; m < numCols; m++) {
          if (estimatedCountsDenom[j][row] != 0.0) {
            condProbs[j][row][m] = estimatedCounts[j][row][m] / estimatedCountsDenom[j][row];
          } else {
            condProbs[j][row][m] = Double.NaN;
          }

          outputBayesIm.setProbability(varIndex, row, m, condProbs[j][row][m]);
        }
      }
    }
  }

  return outputBayesIm;
}
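In symbols, and consistent with the Javadoc above, the accumulated quantities are the usual E-step expected counts: for variable \(X_i\), parent configuration \(j\), and value \(k\),

\[
\hat{N}_{ijk} = \sum_{d=1}^{D} P_{\theta}\!\left(X_i = k,\ \mathrm{Pa}(X_i) = j \mid \text{case } d\right),
\qquad
\hat{P}(X_i = k \mid \mathrm{Pa}(X_i) = j) = \frac{\hat{N}_{ijk}}{\sum_{k'} \hat{N}_{ijk'}},
\]

where fully observed cases contribute 0 or 1 (the "+= 1.0" branch), partially observed cases contribute the joint marginal returned by the row-summing updater, and the denominator array accumulates \(P_{\theta}(\mathrm{Pa}(X_i) = j \mid \text{case } d)\).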
private void initialize() {
  DirichletBayesIm prior = DirichletBayesIm.symmetricDirichletIm(bayesPmObs, 0.5);
  observedIm = DirichletEstimator.estimate(prior, dataSet);

  // MLBayesEstimator dirichEst = new MLBayesEstimator();
  // observedIm = dirichEst.estimate(bayesPmObs, dataSet);

  // mixedData should be ddsNm with new columns for the latent variables. Each such column
  // should contain missing data for each case.
  int numFullCases = dataSet.getNumRows();
  List<Node> variables = new LinkedList<Node>();

  for (Node node : nodes) {
    if (node.getNodeType() == NodeType.LATENT) {
      int numCategories = bayesPm.getNumCategories(node);
      DiscreteVariable latentVar = new DiscreteVariable(node.getName(), numCategories);
      variables.add(latentVar);
    } else {
      String name = bayesPm.getVariable(node).getName();
      Node variable = dataSet.getVariable(name);
      variables.add(variable);
    }
  }

  DataSet dsMixed = new ColtDataSet(numFullCases, variables);

  for (int j = 0; j < nodes.length; j++) {
    if (nodes[j].getNodeType() == NodeType.LATENT) {
      for (int i = 0; i < numFullCases; i++) {
        dsMixed.setInt(i, j, -99);
      }
    } else {
      String name = bayesPm.getVariable(nodes[j]).getName();
      Node variable = dataSet.getVariable(name);
      int index = dataSet.getColumn(variable);

      for (int i = 0; i < numFullCases; i++) {
        dsMixed.setInt(i, j, dataSet.getInt(i, index));
      }
    }
  }

  mixedData = dsMixed;
  allVariables = mixedData.getVariables();

  // Find the Bayes net which is parameterized using mixedData, or set it randomly when that's
  // not possible.
  estimateIM(bayesPm, mixedData);

  estimatedCounts = new double[nodes.length][][];
  estimatedCountsDenom = new double[nodes.length][];
  condProbs = new double[nodes.length][][];

  for (int i = 0; i < nodes.length; i++) {
    int numRows = estimatedIm.getNumRows(i);
    estimatedCounts[i] = new double[numRows][];
    estimatedCountsDenom[i] = new double[numRows];
    condProbs[i] = new double[numRows][];

    for (int j = 0; j < estimatedIm.getNumRows(i); j++) {
      int numCols = estimatedIm.getNumColumns(i);
      estimatedCounts[i][j] = new double[numCols];
      condProbs[i][j] = new double[numCols];
    }
  }
}
public final DataSet filter(DataSet dataSet) {
  // Why does it have to be discrete? Why can't we simply expand whatever discrete columns are
  // there and leave the continuous ones untouched? jdramsey 7/4/2005
  // if (!(dataSet.isDiscrete())) {
  //   throw new IllegalArgumentException("Data set must be discrete.");
  // }

  List<Node> variables = new LinkedList<>();

  // Add all of the variables to the new data set.
  for (int j = 0; j < dataSet.getNumColumns(); j++) {
    Node _var = dataSet.getVariable(j);

    if (!(_var instanceof DiscreteVariable)) {
      variables.add(_var);
      continue;
    }

    DiscreteVariable variable = (DiscreteVariable) _var;

    String oldName = variable.getName();
    List<String> oldCategories = variable.getCategories();
    List<String> newCategories = new LinkedList<>(oldCategories);

    String newCategory = "Missing";
    int _j = 0;

    while (oldCategories.contains(newCategory)) {
      newCategory = "Missing" + (++_j);
    }

    newCategories.add(newCategory);
    String newName = oldName + "+";
    DiscreteVariable newVariable = new DiscreteVariable(newName, newCategories);

    variables.add(newVariable);
  }

  DataSet newDataSet = new ColtDataSet(dataSet.getNumRows(), variables);

  // Copy old values to the new data set, replacing missing values with the new "Missing"
  // category.
  for (int j = 0; j < dataSet.getNumColumns(); j++) {
    Node _var = dataSet.getVariable(j);

    if (_var instanceof ContinuousVariable) {
      for (int i = 0; i < dataSet.getNumRows(); i++) {
        newDataSet.setDouble(i, j, dataSet.getDouble(i, j));
      }
    } else if (_var instanceof DiscreteVariable) {
      DiscreteVariable variable = (DiscreteVariable) _var;
      int numCategories = variable.getNumCategories();

      for (int i = 0; i < dataSet.getNumRows(); i++) {
        int value = dataSet.getInt(i, j);

        if (value == DiscreteVariable.MISSING_VALUE) {
          newDataSet.setInt(i, j, numCategories);
        } else {
          newDataSet.setInt(i, j, value);
        }
      }
    }
  }

  return newDataSet;
}
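A small self-contained sketch of the recoding rule the filter applies to each discrete column, using plain arrays and illustrative names: missing entries map to a new category index equal to the old number of categories, and everything else is copied through unchanged.

public final class MissingCategoryDemo {

  static final int MISSING = -99; // stand-in for the library's missing-value marker

  /** Maps missing entries to the extra category numCategories; other values are unchanged. */
  static int[] recode(int[] column, int numCategories) {
    int[] recoded = new int[column.length];
    for (int i = 0; i < column.length; i++) {
      recoded[i] = (column[i] == MISSING) ? numCategories : column[i];
    }
    return recoded;
  }

  public static void main(String[] args) {
    int[] column = {0, MISSING, 2, 1, MISSING};
    // With 3 original categories, missing entries become category 3.
    System.out.println(java.util.Arrays.toString(recode(column, 3))); // [0, 3, 2, 1, 3]
  }
}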