// This takes an inordinate amount of time. -jdramsey 20150929
  private int[] getNonMissingRows(Node x, Node y, List<Node> z) {
    //        List<Integer> rows = new ArrayList<Integer>();
    //        I:
    //        for (int i = 0; i < internalData.getNumRows(); i++) {
    //            for (Node node : variablesPerNode.get(x)) {
    //                if (isMissing(node, i)) continue I;
    //            }
    //            for (Node node : variablesPerNode.get(y)) {
    //                if (isMissing(node, i)) continue I;
    //            }
    //            for (Node _z : z) {
    //                for (Node node : variablesPerNode.get(_z)) {
    //                    if (isMissing(node, i)) continue I;
    //                }
    //            }
    //            rows.add(i);
    //        }

    //        int[] _rows = new int[rows.size()];
    //        for (int k = 0; k < rows.size(); k++) _rows[k] = rows.get(k);

    if (_rows == null) {
      _rows = new int[internalData.getNumRows()];
      for (int k = 0; k < _rows.length; k++) _rows[k] = k;

    return _rows;
// Example no. 2
  /**
   * Constructs a test using a given data set. If a data set is provided (that is, a tabular data
   * set), fourth moment statistics can be calculated (p. 160); otherwise, it must be assumed that
   * the data are multivariate Gaussian.
   */
  public DeltaSextadTest(DataSet dataSet) {
    // Builds the test from a continuous tabular data set: stores its covariance matrix, a centered
    // copy of the data, the raw values indexed by variable, and the per-variable means.
    // Throws NullPointerException for a null data set and IllegalArgumentException for a data set
    // that is not continuous.
    if (dataSet == null) {
      throw new NullPointerException("dataSet must not be null");
    }

    if (!dataSet.isContinuous()) {
      throw new IllegalArgumentException("dataSet must be continuous");
    }

    this.cov = new CovarianceMatrix(dataSet);

    // DataUtils.center takes a list of data sets, so the single input is wrapped in a list. The
    // list has to contain the data set; with an empty list there is no element 0 to read back.
    List<DataSet> data1 = new ArrayList<DataSet>();
    data1.add(dataSet);
    List<DataSet> data2 = DataUtils.center(data1);

    this.dataSet = data2.get(0);

    // Transposed so that data[i] holds the values of variable i (see the means loop below).
    this.data = this.dataSet.getDoubleData().transpose().toArray();
    this.N = dataSet.getNumRows();
    this.variables = dataSet.getVariables();
    this.numVars = dataSet.getNumColumns();

    // Maps each variable to its column position.
    this.variablesHash = new HashMap<Node, Integer>();

    for (int i = 0; i < variables.size(); i++) {
      variablesHash.put(variables.get(i), i);
    }

    this.means = new double[numVars];

    for (int i = 0; i < numVars; i++) {
      means[i] = mean(data[i], N);
    }
  }
// Example no. 3
  /**
   * Finds the maximum value in one column of a discrete data set.
   *
   * @param dataSet A discrete data set.
   * @param column the column in question.
   * @return the max value in that column.
   */
  private int maxInColumn(DataSet dataSet, int column) {
    int max = -1;

    for (int i = 0; i < dataSet.getNumRows(); i++) {
      int value = dataSet.getInt(i, column);
      if (value > max) max = value;

    return max;
// Example no. 4
  /** @return the splitNames selected by the editor. */
  public static DataModel createSplits(DataSet dataSet, SplitCasesParams params) {
    List<Integer> indices = new ArrayList<Integer>(dataSet.getNumRows());
    for (int i = 0; i < dataSet.getNumRows(); i++) {

    if (params.isDataShuffled()) {

    SplitCasesSpec spec = params.getSpec();
    int numSplits = params.getNumSplits();
    int sampleSize = spec.getSampleSize();
    int[] breakpoints = spec.getBreakpoints();
    List<String> splitNames = spec.getSplitNames();

    int[] _breakpoints = new int[breakpoints.length + 2];
    _breakpoints[0] = 0;
    _breakpoints[_breakpoints.length - 1] = sampleSize;
    System.arraycopy(breakpoints, 0, _breakpoints, 1, breakpoints.length);

    DataModelList list = new DataModelList();
    int ncols = dataSet.getNumColumns();
    for (int n = 0; n < numSplits; n++) {
      int _sampleSize = _breakpoints[n + 1] - _breakpoints[n];

      DataSet _data = new ColtDataSet(_sampleSize, dataSet.getVariables());

      for (int i = 0; i < _sampleSize; i++) {
        int oldCase = indices.get(i + _breakpoints[n]);

        for (int j = 0; j < ncols; j++) {
          _data.setObject(i, j, dataSet.getObject(oldCase, j));


    return list;
// Example no. 5
  private void setDataSet(DataSet dataSet) {
    List<String> _varNames = dataSet.getVariableNames();

    this.variables = dataSet.getVariables();
    this.dataSet = dataSet;
    this.discrete = dataSet.isDiscrete();

    if (!isDiscrete()) {
      this.covariances = new CovarianceMatrix(dataSet);

    this.sampleSize = dataSet.getNumRows();
// Example no. 6
  /**
   * Creates a cell count table for the given data set.
   *
   * <p>Records the number of categories of every column in {@code dims} and the number of rows in
   * {@code numRows}.
   *
   * @param dataSet the data set to tabulate. Every column variable is cast to
   *     {@code DiscreteVariable}, so a column of any other type fails with a
   *     {@link ClassCastException}.
   * @throws NullPointerException if {@code dataSet} is null.
   */
  public DataSetProbs(DataSet dataSet) {
    if (dataSet == null) {
      throw new NullPointerException("dataSet must not be null");
    }

    this.dataSet = dataSet;
    dims = new int[dataSet.getNumColumns()];

    for (int i = 0; i < dims.length; i++) {
      DiscreteVariable variable = (DiscreteVariable) dataSet.getVariable(i);
      dims[i] = variable.getNumCategories();
    }

    numRows = dataSet.getNumRows();
  }
  /**
   * Expands a node into the list of variables that stand in for it.
   *
   * <p>Continuous variables and discrete variables with fewer than three categories are returned
   * unchanged as a singleton list. For a discrete variable with three or more categories, one
   * two-category variable named {@code <name>MULTINOM.<category>} is created per category, and
   * for every row of {@code dataSet} a 1 is written into that variable's column when the row's
   * value of {@code node} is that category and a 0 otherwise.
   *
   * <p>NOTE(review): this snippet has lost its closing braces and apparently some statements; see
   * the inline notes.
   *
   * @param dataSet the data set that is read and written.
   * @param node the variable to expand.
   * @return the variables representing {@code node}.
   * @throws IllegalArgumentException if {@code node} is neither continuous nor discrete.
   */
  private List<Node> expandVariable(DataSet dataSet, Node node) {
    // Continuous variables need no expansion.
    if (node instanceof ContinuousVariable) {
      return Collections.singletonList(node);

    // Discrete variables with fewer than three categories are used as they are.
    if (node instanceof DiscreteVariable && ((DiscreteVariable) node).getNumCategories() < 3) {
      return Collections.singletonList(node);

    // Anything that is not discrete at this point is unsupported.
    if (!(node instanceof DiscreteVariable)) {
      throw new IllegalArgumentException();

    // Copy of the category names, so the variable's own list is not touched.
    List<String> varCats = new ArrayList<String>(((DiscreteVariable) node).getCategories());

    // first category is reference
    // NOTE(review): nothing visible here removes the first category from varCats, so the loop
    // below covers every category -- confirm whether a removal statement was lost.
    List<Node> variables = new ArrayList<Node>();

    for (String cat : varCats) {

      Node newVar;

      // NOTE(review): the candidate name is identical on every iteration, so if a variable with
      // that name already exists in dataSet this loop cannot terminate -- confirm.
      do {
        String newVarName = node.getName() + "MULTINOM" + "." + cat;
        newVar = new DiscreteVariable(newVarName, 2);
      } while (dataSet.getVariable(newVar.getName()) != null);


      // NOTE(review): no visible statement adds newVar to dataSet or to the 'variables' list
      // before its column is looked up and the list is returned -- presumably lost; confirm.
      int newVarIndex = dataSet.getColumn(newVar);
      int numCases = dataSet.getNumRows();

      // Write the 0/1 indicator for this category, row by row.
      for (int l = 0; l < numCases; l++) {
        Object dataCell = dataSet.getObject(l, dataSet.getColumn(node));
        int dataCellIndex = ((DiscreteVariable) node).getIndex(dataCell.toString());

        if (dataCellIndex == ((DiscreteVariable) node).getIndex(cat))
          dataSet.setInt(l, newVarIndex, 1);
        else dataSet.setInt(l, newVarIndex, 0);

    return variables;
 private int sampleSize() {
   return dataSet.getNumRows();
  /**
   * Computes p-values for the dependence of {@code x} on {@code y} and the conditioning nodes
   * {@code z} using logistic regression.
   *
   * <p>Each variable registered for {@code x} in {@code variablesPerNode} is regressed on
   * {@code yzDumList}. For every regressor coefficient a two-sided p-value is taken from a t
   * distribution applied to |coefficient / standard error|, and the logs of those p-values are
   * summed per entry of {@code yzList}. Each sum is then turned into one p-value through a
   * chi-squared distribution evaluated at {@code -2 * sum}.
   *
   * <p>NOTE(review): this snippet has lost its closing braces and apparently some statements; see
   * the inline notes.
   *
   * @param x the node whose variables are the regression targets.
   * @param y the node tested against {@code x}.
   * @param z the conditioning nodes.
   * @return one p-value per entry of {@code yzList}.
   * @throws IllegalArgumentException if any of the nodes is not a key of
   *     {@code variablesPerNode}.
   */
  private double[] dependencePvalsLogit(Node x, Node y, List<Node> z) {
    if (!variablesPerNode.containsKey(x)) {
      throw new IllegalArgumentException("Unrecogized node: " + x);

    if (!variablesPerNode.containsKey(y)) {
      throw new IllegalArgumentException("Unrecogized node: " + y);

    for (Node node : z) {
      if (!variablesPerNode.containsKey(node)) {
        throw new IllegalArgumentException("Unrecogized node: " + node);

    // NOTE(review): pValues and _rows are assigned but not read in the visible code.
    List<Double> pValues = new ArrayList<Double>();

    int[] _rows = getNonMissingRows(x, y, z);

    List<Node> yzDumList = new ArrayList<>();
    List<Node> yzList = new ArrayList<>();
    // List<Node> zList = new ArrayList<>();

    // NOTE(review): no visible statement fills yzDumList or yzList, so as shown sumLnP below has
    // length 0 -- presumably y, z and their variables were added here; confirm.
    for (Node _z : z) {
      // zList.addAll(variablesPerNode.get(_z));

    // double[][] coeffsDep = new double[variablesPerNode.get(x).size()][];
    // DoubleMatrix2D coeffsNull = DoubleFactory2D.dense.make(zList.size(),
    // variablesPerNode.get(x).size());
    // DoubleMatrix2D coeffsDep = DoubleFactory2D.dense.make(yzDumList.size()+1,
    // variablesPerNode.get(x).size());
    // One running sum of log p-values per entry of yzList.
    double[] sumLnP = new double[yzList.size()];
    for (int i = 0; i < sumLnP.length; i++) sumLnP[i] = 0.0;

    // One logistic regression per variable registered for x.
    for (int i = 0; i < variablesPerNode.get(x).size(); i++) {
      Node _x = variablesPerNode.get(x).get(i);

      LogisticRegression.Result result1 =
          logisticRegression.regress((DiscreteVariable) _x, yzDumList);

      // n - k is used as the degrees of freedom of the t distribution below.
      int n = originalData.getNumRows();
      int k = yzDumList.size();

      // skip intercept at index 0
      // NOTE(review): coefIndex is never advanced in the visible code, so every p-value would be
      // computed from coefficient 1 -- presumably an increment was lost; confirm.
      int coefIndex = 1;
      for (int j = 0; j < yzList.size(); j++) {
        for (int dum = 0; dum < variablesPerNode.get(yzList.get(j)).size(); dum++) {

          // Absolute ratio of coefficient to standard error.
          double wald = Math.abs(result1.getCoefs()[coefIndex] / result1.getStdErrs()[coefIndex]);
          // double val = (1.0 - new
          // NormalDistribution(0,1).cumulativeProbability(wald))*2;//two-tailed test
          // double val = 1-result1.getProbs()[i+1];

          // this is exactly the same test as the linear case
          double val = (1.0 - ProbUtils.tCdf(wald, n - k)) * 2;
          // System.out.println(_x.getName() + "\t" + yzDumList.get(coefIndex-1).getName() + "\t" +
          // val + "\t" + (n-k));
          // if(val <= 0) System.out.println("Zero p-val t-test: p " + val + " stat " + wald + " k "
          // + k + " n " + n);
          // Math.log(0) is negative infinity; that case is handled when pVec is filled.
          sumLnP[j] += Math.log(val);

    double[] pVec = new double[sumLnP.length];
    for (int i = 0; i < pVec.length; i++) {
      // A sum of negative infinity means at least one p-value was 0.
      if (sumLnP[i] == Double.NEGATIVE_INFINITY) pVec[i] = 0.0;
      else {
        // Two degrees of freedom per p-value that went into the sum.
        int df = 2 * variablesPerNode.get(x).size() * variablesPerNode.get(yzList.get(i)).size();
        pVec[i] = 1.0 - new ChiSquaredDistribution(df).cumulativeProbability(-2 * sumLnP[i]);

    return pVec;
  /**
   * Builds a lagged version of the discrete data set held in {@code dataSet}.
   *
   * <p>For every lag {@code s = 0..m} (with {@code m = getNumLags()}) and every original variable
   * a copy of the variable is created, positioned for display and assigned to knowledge tier
   * {@code s}. The column values are then copied, shifted by the lag, into a new data set.
   *
   * <p>NOTE(review): this snippet has lost its closing braces and apparently some statements; see
   * the inline notes.
   */
  private void createDiscreteTimeSeriesData() {

    // GIVEN: data set D, maximum lag m.
    // NOTE(review): the original comment said "Continuous", but the code below casts the
    // variables to DiscreteVariable and reads ints -- confirm.
    Node[] dataVars = dataSet.getVariables().toArray(new Node[0]);
    int n = dataVars.length;
    int m = getNumLags();

    // Let Xi, i = 0,...,n-1, be the variables from the data. Let Xi(t) be
    // the variable Xi at time lag t (before 0), t = 0,...,m.
    Node[][] laggedVars = new Node[m + 1][n];
    Knowledge knowledge = new Knowledge();

    for (int s = 0; s <= m; s++) {
      for (int j = 0; j < n; j++) {
        String name1 = dataVars[j].getName();
        // NOTE(review): name2 is built but never applied to the copied variable in the visible
        // code, so every lag's copy would keep the original name -- presumably a rename was lost.
        String name2 = name1 + "." + (s + 1);
        laggedVars[s][j] = new DiscreteVariable((DiscreteVariable) dataVars[j]);
        // Layout: one column of nodes per variable, one row per lag.
        laggedVars[s][j].setCenter(80 * j + 50, 80 * (m - s) + 50);
        knowledge.addToTier(s, laggedVars[s][j].getName());

    // 2. Prepare the data the way you did.
    // NOTE(review): nothing visible adds to 'variables' before it is handed to ColtDataSet below.
    List<Node> variables = new LinkedList<Node>();

    for (int s = 0; s <= m; s++) {
      for (int i = 0; i < n; i++) {
        // Column i of the original data as a plain int array.
        int[] rawData = new int[dataSet.getNumRows()];

        for (int j = 0; j < dataSet.getNumRows(); j++) {
          rawData[j] = dataSet.getInt(j, i);

        int size = dataSet.getNumRows();

        // NOTE(review): the copy reads positions (m - s) .. (size - s) of rawData, whose last
        // valid index is size - 1; for s == 0 System.arraycopy would throw
        // IndexOutOfBoundsException. The window length looks like it should be size - m.
        int[] laggedRaw = new int[size - m + 1];
        System.arraycopy(rawData, m - s, laggedRaw, 0, size - m + 1);

    DataSet _laggedData = new ColtDataSet(dataSet.getNumRows() - m + 1, variables);

    // Same extraction again, this time writing the shifted values into _laggedData.
    for (int s = 0; s <= m; s++) {
      for (int i = 0; i < n; i++) {
        int[] rawData = new int[dataSet.getNumRows()];

        for (int j = 0; j < dataSet.getNumRows(); j++) {
          rawData[j] = dataSet.getInt(j, i);

        int size = dataSet.getNumRows();

        // NOTE(review): same out-of-range copy for s == 0 as in the loop above.
        int[] laggedRaw = new int[size - m + 1];
        System.arraycopy(rawData, m - s, laggedRaw, 0, size - m + 1);
        int _col = _laggedData.getColumn(laggedVars[s][i]);

        // NOTE(review): j runs over all rows of the original data, but laggedRaw has only
        // size - m + 1 entries, so laggedRaw[j] goes out of range whenever m > 1.
        for (int j = 0; j < dataSet.getNumRows(); j++) {
          _laggedData.setInt(j, _col, laggedRaw[j]);

    DataModelList list = new DataModelList();
// Example no. 11
  /**
   * Estimates edge coefficients of the given SEM instantiated model from {@code dataSet} and
   * writes them into the model.
   *
   * <p>The method works in two parts. First, for every latent in {@code ly} it builds data
   * matrices {@code Z} and {@code V} and a response vector {@code yi} from columns of
   * {@code dataSet}, derives {@code Z_hat} and {@code A_hat} from them with {@code MatrixUtils}
   * products and inverses, and sets the latent-to-latent parameters. Second, for every observed
   * node that is not a fixed measurement it builds {@code Z}, {@code C}, {@code V} and
   * {@code yi} in the same fashion and sets the latent-to-measurement parameter. Fixed
   * measurements receive the coefficient 1.
   *
   * <p>NOTE(review): this snippet has lost its closing braces and many statements (empty
   * {@code if} bodies, list additions, the left-hand parts of the matrix expressions), so it does
   * not compile as shown. Comments below describe only what is visible.
   *
   * @param semIm the model whose parameters are set.
   * @return the same {@code semIm} instance, after its parameters were updated.
   */
  protected SemIm estimateCoeffs(SemIm semIm) {

    // System.out.print("\n****************\nCalling 2SLS... ");
    SemGraph semGraph = semIm.getSemPm().getGraph();

    // Get list of fixed measurements that will be kept fixed, and the
    // respective latent variables that are their parents.
    // "X" variables are exogenous, while "Y" variables are endogenous.
    List<Node> ly = new LinkedList<Node>();
    List<Node> lx = new LinkedList<Node>();
    List<Node> my1 = new LinkedList<Node>();
    List<Node> mx1 = new LinkedList<Node>();
    List<Node> observed = new LinkedList<Node>();

    // Classify the graph nodes: errors, latents without parents, latents with parents, others.
    // NOTE(review): the branch bodies are missing; presumably they fill lx, ly and observed.
    for (Node nodeA : semGraph.getNodes()) {
      if (nodeA.getNodeType() == NodeType.ERROR) {
      if (nodeA.getNodeType() == NodeType.LATENT) {
        if (semGraph.getParents(nodeA).size() == 0) {
        } else {
      } else {
    setFixedNodes(semGraph, mx1, my1);

    // ------------------------------------------------------------------

    // Estimate freeParameters for the latent/latent edges
    for (Node current : ly) {
      // When nodeName is set, only the latent with that name is of interest.
      if (nodeName != null && !nodeName.equals(current.getName())) {
      // Build Z, the matrix containing the data for the fixed measurements
      // associated with the parents of the getModel (endogenous) latent node
      List<Node> endo_parents_m = new LinkedList<Node>();
      List<Node> exo_parents_m = new LinkedList<Node>();
      List<Node> endo_parents = new LinkedList<Node>();
      List<Node> exo_parents = new LinkedList<Node>();
      Iterator<Node> it_p = semGraph.getParents(current).iterator();
      lNames = new String[lx.size() + ly.size()];
      // Walk the parents of the current latent, telling exogenous (lx) from endogenous (ly) ones.
      while (it_p.hasNext()) {
        Node node = it_p.next();
        if (node.getNodeType() == NodeType.ERROR) {
        if (lx.contains(node)) {
          int position = lx.indexOf(node);
        } else {
          int position = ly.indexOf(node);
      Object endp_a_m[] = endo_parents_m.toArray();
      Object exop_a_m[] = exo_parents_m.toArray();
      Object endp_a[] = endo_parents.toArray();
      Object exop_a[] = exo_parents.toArray();
      // n rows of data, c columns: endogenous measurement columns first, then exogenous ones.
      int n = dataSet.getNumRows(), c = endp_a_m.length + exop_a_m.length;
      if (c == 0) {
      double Z[][] = new double[n][c];
      int count = 0;

      // Columns 0 .. endp_a_m.length - 1 of Z: data of the endogenous parents' measurements.
      for (int i = 0; i < endp_a_m.length; i++) {
        Node node = (Node) endp_a_m[i];
        String name = node.getName();
        Node variable = dataSet.getVariable(name);
        int colIndex = dataSet.getVariables().indexOf(variable);

        //                Column column = dataSet.getColumnObject(variable);
        //                double column_data[] = (double[]) column.getRawData();

        for (int j = 0; j < n; j++) {
          //                    Z[j][i] = column_data[j];
          Z[j][i] = dataSet.getDouble(j, colIndex);

        lNames[count++] = (endo_parents.get(i)).getName();
      // Remaining columns of Z: data of the exogenous parents' measurements.
      for (int i = 0; i < exop_a_m.length; i++) {
        Node node = (Node) exop_a_m[i];
        String name = node.getName();
        Node variable = dataSet.getVariable(name);
        int colIndex = dataSet.getVariables().indexOf(variable);

        //                Column column = dataSet.getColumnObject(variable);
        //                double column_data[] = (double[]) column.getRawData();

        for (int j = 0; j < n; j++) {
          //                    Z[j][endp_a_m.length + i] = column_data[j];
          Z[j][endp_a_m.length + i] = dataSet.getDouble(j, colIndex);
        lNames[count++] = exo_parents.get(i).getName();
      // Build V, the matrix containing the data for the nonfixed measurements
      // associated with the parents of the getModel (endogenous) latent node
      endo_parents_m = new LinkedList<Node>();
      exo_parents_m = new LinkedList<Node>();
      it_p = semGraph.getParents(current).iterator();
      while (it_p.hasNext()) {
        Node node = it_p.next();
        if (node.getNodeType() == NodeType.ERROR) {
        List<Node> other_measures = new LinkedList<Node>();

        // Measured children of this parent.
        for (Node next : semGraph.getChildren(node)) {
          if (next.getNodeType() == NodeType.MEASURED) {

        if (lx.contains(node)) {
          int position = lx.indexOf(node);
        } else {
          int position = ly.indexOf(node);
      endp_a_m = endo_parents_m.toArray();
      exop_a_m = exo_parents_m.toArray();
      n = dataSet.getNumRows();
      c = endp_a_m.length + exop_a_m.length;
      double V[][] = new double[n][c];
      if (c == 0) {
      // Fill V with the same column layout as Z: endogenous first, then exogenous.
      for (int i = 0; i < endp_a_m.length; i++) {
        Node node = ((Node) endp_a_m[i]);
        String name = node.getName();
        Node variable = dataSet.getVariable(name);
        int colIndex = dataSet.getVariables().indexOf(variable);

        //                Column column = dataSet.getColumnObject(variable);
        //                double column_data[] = (double[]) column.getRawData();

        for (int j = 0; j < n; j++) {
          //                    V[j][i] = column_data[j];
          V[j][i] = dataSet.getDouble(j, colIndex);
      for (int i = 0; i < exop_a_m.length; i++) {
        Node node = (Node) exop_a_m[i];
        String name = node.getName();
        Node variable = dataSet.getVariable(name);
        int colIndex = dataSet.getVariables().indexOf(variable);

        //                Column column = dataSet.getColumnObject(variable);
        //                double column_data[] = (double[]) column.getRawData();

        for (int j = 0; j < n; j++) {
          //                    V[j][endp_a_m.length + i] = column_data[j];
          V[j][endp_a_m.length + i] = dataSet.getDouble(j, colIndex);
      // yi: the data column of the fixed measurement belonging to the current latent.
      double yi[] = new double[n];
      if (lx.contains(current)) {
        int position = lx.indexOf(current);
        Node node = mx1.get(position);
        String name = node.getName();
        Node variable = dataSet.getVariable(name);
        int colIndex = dataSet.getVariables().indexOf(variable);

        //                Column column = dataSet.getColumnObject(variable);
        //                System.arraycopy(column.getRawData(), 0, yi, 0, n);

        for (int i = 0; i < n; i++) {
          yi[i] = dataSet.getDouble(i, colIndex);
      } else {
        int position = ly.indexOf(current);
        Node node = my1.get(position);
        String name = node.getName();
        Node variable = dataSet.getVariable(name);
        int colIndex = dataSet.getVariables().indexOf(variable);

        //                System.arraycopy(dataSet.getColumnObject(variable).getRawData(), 0, yi, 0,
        // n);

        for (int i = 0; i < n; i++) {
          yi[i] = dataSet.getDouble(i, colIndex);
      // Build Z_hat
      // NOTE(review): the outer calls of this expression and of the A_hat expression below are
      // missing; only the (V'V)^-1 / V'Z and (Z_hat'Z_hat)^-1 / Z_hat'yi operands survive.
      double Z_hat[][] =
                  MatrixUtils.inverse(MatrixUtils.product(MatrixUtils.transpose(V), V)),
                  MatrixUtils.product(MatrixUtils.transpose(V), Z)));
      A_hat =
              MatrixUtils.inverse(MatrixUtils.product(MatrixUtils.transpose(Z_hat), Z_hat)),
              MatrixUtils.product(MatrixUtils.transpose(Z_hat), yi));
      // Set the edge for the fixed measurement
      int position = ly.indexOf(current);
      semIm.setParamValue(current, my1.get(position), 1.);
      // Set the edge for the latents
      // A_hat is laid out like the columns of Z: endogenous parents first, then exogenous.
      for (int i = 0; i < endp_a.length; i++) {
        semIm.setParamValue((Node) endp_a[i], current, A_hat[i]);
      for (int i = 0; i < exop_a.length; i++) {
        semIm.setParamValue((Node) exop_a[i], current, A_hat[endp_a.length + i]);
      if (nodeName != null && nodeName.equals(current.getName())) {
        computeAsymptLatentCovar(yi, A_hat, Z, Z_hat, dataSet.getNumRows());

    // ------------------------------------------------------------------

    // Estimate freeParameters of the measurement model

    // Set the edges of the fixed measurements of exogenous
    for (Node current : lx) {
      int position = lx.indexOf(current);
      semIm.setParamValue(current, mx1.get(position), 1.);

    for (Node current : observed) {
      if (nodeName != null && !nodeName.equals(current.getName())) {
      // Fixed measurements were already given the coefficient 1 above.
      if (mx1.contains(current) || my1.contains(current)) {

      // First, get the parent of this observed
      Node current_latent = null;

      for (Node node : semGraph.getParents(current)) {
        if (node.getNodeType() == NodeType.ERROR) {
        current_latent = node;
      Iterator<Node> children = semGraph.getChildren(current_latent).iterator();
      List<Node> other_measures = new LinkedList<Node>();
      Node fixed_measurement;
      // Collect the other measured children of the latent parent (everything except current).
      while (children.hasNext()) {
        Node next = children.next();
        if ((next.getNodeType() == NodeType.MEASURED) && next != current) {
      if (lx.contains(current_latent)) {
        int position = lx.indexOf(current_latent);
        fixed_measurement = mx1.get(position);
      } else {
        int position = ly.indexOf(current_latent);
        fixed_measurement = my1.get(position);
      // Regress other_measures over the fixed measurement x1 (y1) correspondent
      // to the measurement variable that is being evaluated
      int n = dataSet.getNumRows(), c = other_measures.size();
      if (c == 0) {
      double Z[][] = new double[n][c];
      for (int i = 0; i < c; i++) {
        Node variable = dataSet.getVariable((other_measures.get(i)).getName());
        int varIndex = dataSet.getVariables().indexOf(variable);

        //                Column column = dataSet.getColumnObject(variable);
        //                double column_data[] = (double[]) column.getRawData();

        for (int j = 0; j < n; j++) {
          //                    Z[j][i] = column_data[j];
          // NOTE(review): every other getDouble call in this method passes (row, column); here
          // the column index comes first and the row index second -- looks swapped, confirm.
          Z[j][i] = dataSet.getDouble(varIndex, j);

      // Build C, the column matrix containing the data for the fixed
      // measurement associated with the only latent parent of the getModel
      // observed node (as assumed by the structure of our measurement model).
      Node variable = dataSet.getVariable(fixed_measurement.getName());
      int colIndex = dataSet.getVariables().indexOf(variable);
      //            Column column = dataSet.getColumnObject(variable);
      //            double C[] = (double[]) column.getRawData();

      double[] C = new double[dataSet.getNumRows()];

      for (int i = 0; i < dataSet.getNumRows(); i++) {
        // NOTE(review): same (column, row) argument order as flagged for Z above -- confirm.
        C[i] = dataSet.getDouble(colIndex, i);

      // Build V, the matrix containing the data for the other measurements
      // associated with the parents of the (latent) parent of getModel
      // observed node. The only difference with respect to the estimation
      // of the within-latent coefficients is that here we only include
      // the other measurements attached to the parent of the getModel node,
      // assuming that the error term of the getModel node is independent
      // of the error term of the others and that each measurement is
      // taken with respect to only one latent.
      n = dataSet.getNumRows();
      c = other_measures.size();
      double V[][] = new double[n][c];
      for (int i = 0; i < c; i++) {
        Node variable2 = dataSet.getVariable((other_measures.get(i)).getName());
        int var2index = dataSet.getVariables().indexOf(variable2);

        //                Column column = dataSet.getColumnObject(variable2);
        //                double column_data[] = (double[]) column.getRawData();

        for (int j = 0; j < n; j++) {
          //                    V[j][i] = column_data[j];
          V[j][i] = dataSet.getDouble(j, var2index);
      // yi: the data column of the observed node being evaluated.
      double yi[] = new double[n];
      Node variable3 = dataSet.getVariable((current).getName());
      int var3Index = dataSet.getVariables().indexOf(variable3);

      for (int i = 0; i < n; i++) {
        yi[i] = dataSet.getDouble(i, var3Index);

      //            Object rawData = dataSet.getColumnObject(variable3).getRawData();
      //            System.arraycopy(rawData, 0, yi, 0, n);
      // NOTE(review): as with Z_hat above, the outer calls of the next two expressions are
      // missing from this snippet.
      double C_hat[] =
                  MatrixUtils.inverse(MatrixUtils.product(MatrixUtils.transpose(V), V)),
                  MatrixUtils.product(MatrixUtils.transpose(V), C)));
      double A_hat =
              MatrixUtils.scalarProduct(1. / MatrixUtils.innerProduct(C_hat, C_hat), C_hat), yi);
      // Set the edge for the getModel measurement
      semIm.setParamValue(current_latent, current, A_hat);

    return semIm;
  /**
   * This method takes an instantiated Bayes net (BayesIm) whose graph includes all the variables
   * (observed and latent) and computes estimated counts using the data in the DataSet mixedData.
   * The counts that are estimated correspond to cells in the conditional probability tables of the
   * Bayes net. The outermost loop (indexed by j) is over the set of variables. If the variable has
   * no parents, each case in the dataset is examined and the count for the observed value of the
   * variable is increased by 1.0; if the value of the variable is missing, the marginal
   * probabilities of its values given the values of the variables that are available for that case
   * are used to increment the corresponding estimated counts. If a variable has parents then there
   * is a loop which steps through all possible sets of values of its parents. This loop is indexed
   * by the variable "row". Each case in the dataset is examined. If the variable and all its
   * parents have values in the case, the corresponding estimated counts are incremented by 1.0. If
   * the variable or any of its parents have missing values, the joint marginal is computed for the
   * variable and the set of values of its parents corresponding to "row" and the corresponding
   * estimated counts are incremented by the appropriate probability. The estimated counts are
   * stored in the double[][][] array estimatedCounts. The count (possibly fractional) of the number
   * of times each combination of parent values occurs is stored in the double[][] array
   * estimatedCountsDenom. These two arrays are used to compute the estimated conditional
   * probabilities of the output Bayes net.
   */
  private BayesIm expectation(BayesIm inputBayesIm) {
    // System.out.println("Entered method expectation.");

    int numCases = mixedData.getNumRows();
    // StoredCellEstCounts estCounts = new StoredCellEstCounts(variables);

    int numVariables = allVariables.size();
    RowSummingExactUpdater rseu = new RowSummingExactUpdater(inputBayesIm);

    for (int j = 0; j < numVariables; j++) {
      DiscreteVariable var = (DiscreteVariable) allVariables.get(j);
      String varName = var.getName();
      Node varNode = graph.getNode(varName);
      int varIndex = inputBayesIm.getNodeIndex(varNode);
      int[] parentVarIndices = inputBayesIm.getParents(varIndex);
      // System.out.println("graph = " + graph);

      // for(int col = 0; col < var.getNumSplits(); col++)
      //    System.out.println("Category " + col + " = " + var.getCategory(col));

      // System.out.println("Updating estimated counts for node " + varName);
      // This segment is for variables with no parents:
      if (parentVarIndices.length == 0) {
        // System.out.println("No parents");
        for (int col = 0; col < var.getNumCategories(); col++) {
          estimatedCounts[j][0][col] = 0.0;

        for (int i = 0; i < numCases; i++) {
          // System.out.println("Case " + i);
          // If this case has a value for var
          if (mixedData.getInt(i, j) != -99) {
            estimatedCounts[j][0][mixedData.getInt(i, j)] += 1.0;
            // System.out.println("Adding 1.0 to " + varName +
            //        " row 0 category " + mixedData[j][i]);
          } else {
            // find marginal probability, given obs data in this case, p(v=0)
            Evidence evidenceThisCase = Evidence.tautology(inputBayesIm);
            boolean existsEvidence = false;

            // Define evidence for updating by using the values of the other vars.
            for (int k = 0; k < numVariables; k++) {
              if (k == j) {
              Node otherVar = allVariables.get(k);
              if (mixedData.getInt(i, k) == -99) {
              existsEvidence = true;
              String otherVarName = otherVar.getName();
              Node otherNode = graph.getNode(otherVarName);
              int otherIndex = inputBayesIm.getNodeIndex(otherNode);

              evidenceThisCase.getProposition().setCategory(otherIndex, mixedData.getInt(i, k));

            if (!existsEvidence) {
              continue; // No other variable contained useful data


            for (int m = 0; m < var.getNumCategories(); m++) {
              estimatedCounts[j][0][m] += rseu.getMarginal(varIndex, m);
              // System.out.println("Adding " + p + " to " + varName +
              //        " row 0 category " + m);

              // find marginal probability, given obs data in this case, p(v=1)
              // estimatedCounts[j][0][1] += 0.5;

        // Print estimated counts:
        // System.out.println("Estimated counts:  ");

        // Print counts for each value of this variable with no parents.
        // for(int m = 0; m < var.getNumSplits(); m++)
        //    System.out.print("    " + m + " " + estimatedCounts[j][0][m]);
        // System.out.println();
      } else { // For variables with parents:
        int numRows = inputBayesIm.getNumRows(varIndex);
        for (int row = 0; row < numRows; row++) {
          int[] parValues = inputBayesIm.getParentValues(varIndex, row);
          estimatedCountsDenom[varIndex][row] = 0.0;
          for (int col = 0; col < var.getNumCategories(); col++) {
            estimatedCounts[varIndex][row][col] = 0.0;

          for (int i = 0; i < numCases; i++) {
            // for a case where the parent values = parValues increment the estCount

            boolean parentMatch = true;

            for (int p = 0; p < parentVarIndices.length; p++) {
              if (parValues[p] != mixedData.getInt(i, parentVarIndices[p])
                  && mixedData.getInt(i, parentVarIndices[p]) != -99) {
                parentMatch = false;

            if (!parentMatch) {
              continue; // Not a matching case; go to next.

            boolean parentMissing = false;
            for (int parentVarIndice : parentVarIndices) {
              if (mixedData.getInt(i, parentVarIndice) == -99) {
                parentMissing = true;

            if (mixedData.getInt(i, j) != -99 && !parentMissing) {
              estimatedCounts[j][row][mixedData.getInt(i, j)] += 1.0;
              estimatedCountsDenom[j][row] += 1.0;
              continue; // Next case

            // for a case with missing data (either var or one of its parents)
            // compute the joint marginal
            // distribution for var & this combination of values of its parents
            // and update the estCounts accordingly

            // To compute marginals create the evidence
            boolean existsEvidence = false;

            Evidence evidenceThisCase = Evidence.tautology(inputBayesIm);

            // "evidenceVars" not used.
            //                        List<String> evidenceVars = new LinkedList<String>();
            //                        for (int k = 0; k < numVariables; k++) {
            //                            //if(k == j) continue;
            //                            Variable otherVar = allVariables.get(k);
            //                            if (mixedData.getInt(i, k) == -99) {
            //                                continue;
            //                            }
            //                            existsEvidence = true;
            //                            String otherVarName = otherVar.getName();
            //                            Node otherNode = graph.getNode(otherVarName);
            //                            int otherIndex = inputBayesIm.getNodeIndex(
            //                                    otherNode);
            //                            evidenceThisCase.getProposition().setCategory(
            //                                    otherIndex, mixedData.getInt(i, k));
            //                            evidenceVars.add(otherVarName);
            //                        }

            if (!existsEvidence) {


            estimatedCountsDenom[j][row] += rseu.getJointMarginal(parentVarIndices, parValues);

            int[] parPlusChildIndices = new int[parentVarIndices.length + 1];
            int[] parPlusChildValues = new int[parentVarIndices.length + 1];

            parPlusChildIndices[0] = varIndex;
            for (int pc = 1; pc < parPlusChildIndices.length; pc++) {
              parPlusChildIndices[pc] = parentVarIndices[pc - 1];
              parPlusChildValues[pc] = parValues[pc - 1];

            for (int m = 0; m < var.getNumCategories(); m++) {

              parPlusChildValues[0] = m;

              if(varName.equals("X1") && i == 0 ) {
                  System.out.println("Calling getJointMarginal with parvalues");
                  for(int k = 0; k < parPlusChildIndices.length; k++) {
                      int pIndex = parPlusChildIndices[k];
                      Node pNode = inputBayesIm.getNode(pIndex);
                      String pName = pNode.getName();
                      System.out.println(pName + " " + parPlusChildValues[k]);

              if(varName.equals("X1") && i == 0 ) {
                  System.out.println("Evidence = " + evidenceThisCase);
                  //int[] vars = {l1Index, x1Index};
                  Node nodex1 = inputBayesIm.getNode("X1");
                  int x1Index = inputBayesIm.getNodeIndex(nodex1);
                  Node nodel1 = inputBayesIm.getNode("L1");
                  int l1Index = inputBayesIm.getNodeIndex(nodel1);

                  int[] vars = {l1Index, x1Index};
                  int[] vals = {0, 0};
                  double ptest = rseu.getJointMarginal(vars, vals);
                  System.out.println("Joint marginal (X1=0, L1 = 0) = " + p);

              estimatedCounts[j][row][m] +=
                  rseu.getJointMarginal(parPlusChildIndices, parPlusChildValues);

              // System.out.println("Case " + i + " parent values ");
              // for (int pp = 0; pp < parentVarIndices.length; pp++) {
              //    Variable par = (Variable) allVariables.get(parentVarIndices[pp]);
              //    System.out.print("    " + par.getName() + " " + parValues[pp]);
              // }

              // System.out.println();
              // System.out.println("Adding " + p + " to " + varName +
              //        " row " + row + " category " + m);

            // }

          // Print estimated counts:
          // System.out.println("Estimated counts:  ");
          // System.out.println("    Parent values:  ");
          // for (int i = 0; i < parentVarIndices.length; i++) {
          //    Variable par = (Variable) allVariables.get(parentVarIndices[i]);
          //    System.out.print("    " + par.getName() + " " + parValues[i] + "    ");
          // }
          // System.out.println();

          // for(int m = 0; m < var.getNumSplits(); m++)
          //    System.out.print("    " + m + " " + estimatedCounts[j][row][m]);
          // System.out.println();

      } // else
    } // j < numVariables

    BayesIm outputBayesIm = new MlBayesIm(bayesPm);

    for (int j = 0; j < nodes.length; j++) {

      DiscreteVariable var = (DiscreteVariable) allVariables.get(j);
      String varName = var.getName();
      Node varNode = graph.getNode(varName);
      int varIndex = inputBayesIm.getNodeIndex(varNode);
      //            int[] parentVarIndices = inputBayesIm.getParents(varIndex);

      int numRows = inputBayesIm.getNumRows(j);
      // System.out.println("Conditional probabilities for variable " + varName);

      int numCols = inputBayesIm.getNumColumns(j);
      if (numRows == 1) {
        double sum = 0.0;
        for (int m = 0; m < numCols; m++) {
          sum += estimatedCounts[j][0][m];

        for (int m = 0; m < numCols; m++) {
          condProbs[j][0][m] = estimatedCounts[j][0][m] / sum;
          // System.out.print("  " + condProbs[j][0][m]);
          outputBayesIm.setProbability(varIndex, 0, m, condProbs[j][0][m]);
        // System.out.println();
      } else {

        for (int row = 0; row < numRows; row++) {
          //                    int[] parValues = inputBayesIm.getParentValues(varIndex,
          //                            row);
          // int numCols = inputBayesIm.getNumColumns(j);

          // for (int p = 0; p < parentVarIndices.length; p++) {
          //    Variable par = (Variable) allVariables.get(parentVarIndices[p]);
          //    System.out.print("    " + par.getName() + " " + parValues[p]);
          // }

          // double sum = 0.0;
          // for(int m = 0; m < numCols; m++)
          //    sum += estimatedCounts[j][row][m];

          for (int m = 0; m < numCols; m++) {
            if (estimatedCountsDenom[j][row] != 0.0) {
              condProbs[j][row][m] = estimatedCounts[j][row][m] / estimatedCountsDenom[j][row];
            } else {
              condProbs[j][row][m] = Double.NaN;
            // System.out.print("  " + condProbs[j][row][m]);
            outputBayesIm.setProbability(varIndex, row, m, condProbs[j][row][m]);
          // System.out.println();


    return outputBayesIm;
  private void initialize() {
    DirichletBayesIm prior = DirichletBayesIm.symmetricDirichletIm(bayesPmObs, 0.5);
    observedIm = DirichletEstimator.estimate(prior, dataSet);

    //        MLBayesEstimator dirichEst = new MLBayesEstimator();
    //        observedIm = dirichEst.estimate(bayesPmObs, dataSet);

    //        System.out.println("Estimated Bayes IM for Measured Variables:  ");
    //        System.out.println(observedIm);

    // mixedData should be ddsNm with new columns for the latent variables.
    // Each such column should contain missing data for each case.

    int numFullCases = dataSet.getNumRows();
    List<Node> variables = new LinkedList<Node>();

    for (Node node : nodes) {
      if (node.getNodeType() == NodeType.LATENT) {
        int numCategories = bayesPm.getNumCategories(node);
        DiscreteVariable latentVar = new DiscreteVariable(node.getName(), numCategories);
      } else {
        String name = bayesPm.getVariable(node).getName();
        Node variable = dataSet.getVariable(name);

    DataSet dsMixed = new ColtDataSet(numFullCases, variables);

    for (int j = 0; j < nodes.length; j++) {
      if (nodes[j].getNodeType() == NodeType.LATENT) {
        for (int i = 0; i < numFullCases; i++) {
          dsMixed.setInt(i, j, -99);
      } else {
        String name = bayesPm.getVariable(nodes[j]).getName();
        Node variable = dataSet.getVariable(name);
        int index = dataSet.getColumn(variable);

        for (int i = 0; i < numFullCases; i++) {
          dsMixed.setInt(i, j, dataSet.getInt(i, index));

    //        System.out.println(dsMixed);

    mixedData = dsMixed;
    allVariables = mixedData.getVariables();

    // Find the bayes net which is parameterized using mixedData or set randomly when that's
    // not possible.
    estimateIM(bayesPm, mixedData);

    // The following DEBUG section tests a case specified by P. Spirtes
    // DEBUG TAIL:   For use with embayes_l1x1x2x3V3.dat
    Node l1Node = graph.getNode("L1");
    //int l1Index = bayesImMixed.getNodeIndex(l1Node);
    int l1index = estimatedIm.getNodeIndex(l1Node);
    Node x1Node = graph.getNode("X1");
    //int x1Index = bayesImMixed.getNodeIndex(x1Node);
    int x1Index = estimatedIm.getNodeIndex(x1Node);
    Node x2Node = graph.getNode("X2");
    //int x2Index = bayesImMixed.getNodeIndex(x2Node);
    int x2Index = estimatedIm.getNodeIndex(x2Node);
    Node x3Node = graph.getNode("X3");
    //int x3Index = bayesImMixed.getNodeIndex(x3Node);
    int x3Index = estimatedIm.getNodeIndex(x3Node);

    estimatedIm.setProbability(l1index, 0, 0, 0.5);
    estimatedIm.setProbability(l1index, 0, 1, 0.5);

    //bayesImMixed.setProbability(x1Index, 0, 0, 0.33333);
    //bayesImMixed.setProbability(x1Index, 0, 1, 0.66667);
    estimatedIm.setProbability(x1Index, 0, 0, 0.6);      //p(x1 = 0 | l1 = 0)
    estimatedIm.setProbability(x1Index, 0, 1, 0.4);      //p(x1 = 1 | l1 = 0)
    estimatedIm.setProbability(x1Index, 1, 0, 0.4);      //p(x1 = 0 | l1 = 1)
    estimatedIm.setProbability(x1Index, 1, 1, 0.6);      //p(x1 = 1 | l1 = 1)

    //bayesImMixed.setProbability(x2Index, 1, 0, 0.66667);
    //bayesImMixed.setProbability(x2Index, 1, 1, 0.33333);
    estimatedIm.setProbability(x2Index, 1, 0, 0.4);      //p(x2 = 0 | l1 = 1)
    estimatedIm.setProbability(x2Index, 1, 1, 0.6);      //p(x2 = 1 | l1 = 1)
    estimatedIm.setProbability(x2Index, 0, 0, 0.6);      //p(x2 = 0 | l1 = 0)
    estimatedIm.setProbability(x2Index, 0, 1, 0.4);      //p(x2 = 1 | l1 = 0)

    //bayesImMixed.setProbability(x3Index, 1, 0, 0.66667);
    //bayesImMixed.setProbability(x3Index, 1, 1, 0.33333);
    estimatedIm.setProbability(x3Index, 1, 0, 0.4);      //p(x3 = 0 | l1 = 1)
    estimatedIm.setProbability(x3Index, 1, 1, 0.6);      //p(x3 = 1 | l1 = 1)
    estimatedIm.setProbability(x3Index, 0, 0, 0.6);      //p(x3 = 0 | l1 = 0)
    estimatedIm.setProbability(x3Index, 0, 1, 0.4);      //p(x3 = 1 | l1 = 0)
    // END of TAIL

    // System.out.println("bayes IM estimated by estimateIM");
    // System.out.println(bayesImMixed);
    // System.out.println(estimatedIm);

    estimatedCounts = new double[nodes.length][][];
    estimatedCountsDenom = new double[nodes.length][];
    condProbs = new double[nodes.length][][];

    for (int i = 0; i < nodes.length; i++) {
      // int numRows = bayesImMixed.getNumRows(i);
      int numRows = estimatedIm.getNumRows(i);
      estimatedCounts[i] = new double[numRows][];
      estimatedCountsDenom[i] = new double[numRows];
      condProbs[i] = new double[numRows][];
      // for(int j = 0; j < bayesImMixed.getNumRows(i); j++) {
      for (int j = 0; j < estimatedIm.getNumRows(i); j++) {
        // int numCols = bayesImMixed.getNumColumns(i);
        int numCols = estimatedIm.getNumColumns(i);
        estimatedCounts[i][j] = new double[numCols];
        condProbs[i][j] = new double[numCols];
Ejemplo n.º 14
  public final DataSet filter(DataSet dataSet) {

    // Why does it have to be discrete? Why can't we simply expand
    // whatever discrete columns are there and leave the continuous
    // ones untouched? jdramsey 7/4/2005
    //        if (!(dataSet.isDiscrete())) {
    //            throw new IllegalArgumentException("Data set must be discrete.");
    //        }

    List<Node> variables = new LinkedList<>();

    // Add all of the variables to the new data set.
    for (int j = 0; j < dataSet.getNumColumns(); j++) {
      Node _var = dataSet.getVariable(j);

      if (!(_var instanceof DiscreteVariable)) {

      DiscreteVariable variable = (DiscreteVariable) _var;

      String oldName = variable.getName();
      List<String> oldCategories = variable.getCategories();
      List<String> newCategories = new LinkedList<>(oldCategories);

      String newCategory = "Missing";
      int _j = 0;

      while (oldCategories.contains(newCategory)) {
        newCategory = "Missing" + (++_j);

      String newName = oldName + "+";
      DiscreteVariable newVariable = new DiscreteVariable(newName, newCategories);


    DataSet newDataSet = new ColtDataSet(dataSet.getNumRows(), variables);

    // Copy old values to new data set, replacing missing values with new
    // "MissingValue" categories.
    for (int j = 0; j < dataSet.getNumColumns(); j++) {
      Node _var = dataSet.getVariable(j);

      if (_var instanceof ContinuousVariable) {
        for (int i = 0; i < dataSet.getNumRows(); i++) {
          newDataSet.setDouble(i, j, dataSet.getDouble(i, j));
      } else if (_var instanceof DiscreteVariable) {
        DiscreteVariable variable = (DiscreteVariable) _var;
        int numCategories = variable.getNumCategories();

        for (int i = 0; i < dataSet.getNumRows(); i++) {
          int value = dataSet.getInt(i, j);

          if (value == DiscreteVariable.MISSING_VALUE) {
            newDataSet.setInt(i, j, numCategories);
          } else {
            newDataSet.setInt(i, j, value);

    return newDataSet;