Exemple #1
0
  /**
   * Computes the distance between two instances (without previous normalization)
   *
   * @param i First instance
   * @param j Second instance
   * @return The Euclidean distance between i and j
   */
  private double distance(Instance i, Instance j) {
    double dist = 0;
    int in = 0;
    int out = 0;

    for (int l = 0; l < nvariables; l++) {
      Attribute a = Attributes.getAttribute(l);

      direccion = a.getDirectionAttribute();
      tipo = a.getType();

      if (direccion == Attribute.INPUT) {
        if (tipo != Attribute.NOMINAL && !i.getInputMissingValues(in)) {
          // real value, apply euclidean distance
          dist +=
              (i.getInputRealValues(in) - j.getInputRealValues(in))
                  * (i.getInputRealValues(in) - j.getInputRealValues(in));
        } else {
          if (!i.getInputMissingValues(in)
              && i.getInputNominalValues(in) != j.getInputNominalValues(in)) dist += 1;
        }
        in++;
      } else {
        if (direccion == Attribute.OUTPUT) {
          if (tipo != Attribute.NOMINAL && !i.getOutputMissingValues(out)) {
            dist +=
                (i.getOutputRealValues(out) - j.getOutputRealValues(out))
                    * (i.getOutputRealValues(out) - j.getOutputRealValues(out));
          } else {
            if (!i.getOutputMissingValues(out)
                && i.getOutputNominalValues(out) != j.getOutputNominalValues(out)) dist += 1;
          }
          out++;
        }
      }
    }
    return dist;
  }
Exemple #2
0
  /** Process the training and test files provided in the parameters file to the constructor. */
  public void process() {
    // declarations
    double[] outputs;
    double[] outputs2;
    Instance neighbor;
    double dist, mean;
    int actual;
    Randomize rnd = new Randomize();
    Instance ex;
    gCenter kmeans = null;
    int iterations = 0;
    double E;
    double prevE;
    int totalMissing = 0;
    boolean allMissing = true;

    rnd.setSeed(semilla);
    // PROCESS
    try {

      // Load in memory a dataset that contains a classification problem
      IS.readSet(input_train_name, true);
      int in = 0;
      int out = 0;

      ndatos = IS.getNumInstances();
      nvariables = Attributes.getNumAttributes();
      nentradas = Attributes.getInputNumAttributes();
      nsalidas = Attributes.getOutputNumAttributes();

      X = new String[ndatos][nvariables]; // matrix with transformed data
      kmeans = new gCenter(K, ndatos, nvariables);

      timesSeen = new FreqList[nvariables];
      mostCommon = new String[nvariables];

      // first, we choose k 'means' randomly from all
      // instances
      totalMissing = 0;
      for (int i = 0; i < ndatos; i++) {
        Instance inst = IS.getInstance(i);
        if (inst.existsAnyMissingValue()) totalMissing++;
      }
      if (totalMissing == ndatos) allMissing = true;
      else allMissing = false;
      for (int numMeans = 0; numMeans < K; numMeans++) {
        do {
          actual = (int) (ndatos * rnd.Rand());
          ex = IS.getInstance(actual);
        } while (ex.existsAnyMissingValue() && !allMissing);

        kmeans.copyCenter(ex, numMeans);
      }

      // now, iterate adjusting clusters' centers and
      // instances to them
      prevE = 0;
      iterations = 0;
      do {
        for (int i = 0; i < ndatos; i++) {
          Instance inst = IS.getInstance(i);

          kmeans.setClusterOf(inst, i);
        }
        // set new centers
        kmeans.recalculateCenters(IS);
        // compute RMSE
        E = 0;
        for (int i = 0; i < ndatos; i++) {
          Instance inst = IS.getInstance(i);

          E += kmeans.distance(inst, kmeans.getClusterOf(i));
        }
        iterations++;
        // System.out.println(iterations+"\t"+E);
        if (Math.abs(prevE - E) == 0) iterations = maxIter;
        else prevE = E;
      } while (E > minError && iterations < maxIter);
      for (int i = 0; i < ndatos; i++) {
        Instance inst = IS.getInstance(i);

        in = 0;
        out = 0;

        for (int j = 0; j < nvariables; j++) {
          Attribute a = Attributes.getAttribute(j);

          direccion = a.getDirectionAttribute();
          tipo = a.getType();

          if (direccion == Attribute.INPUT) {
            if (tipo != Attribute.NOMINAL && !inst.getInputMissingValues(in)) {
              X[i][j] = new String(String.valueOf(inst.getInputRealValues(in)));
            } else {
              if (!inst.getInputMissingValues(in)) X[i][j] = inst.getInputNominalValues(in);
              else {
                actual = kmeans.getClusterOf(i);
                X[i][j] = new String(kmeans.valueAt(actual, j));
              }
            }
            in++;
          } else {
            if (direccion == Attribute.OUTPUT) {
              if (tipo != Attribute.NOMINAL && !inst.getOutputMissingValues(out)) {
                X[i][j] = new String(String.valueOf(inst.getOutputRealValues(out)));
              } else {
                if (!inst.getOutputMissingValues(out)) X[i][j] = inst.getOutputNominalValues(out);
                else {
                  actual = kmeans.getClusterOf(i);
                  X[i][j] = new String(kmeans.valueAt(actual, j));
                }
              }
              out++;
            }
          }
        }
      }
    } catch (Exception e) {
      System.out.println("Dataset exception = " + e);
      e.printStackTrace();
      System.exit(-1);
    }
    write_results(output_train_name);
    /** ************************************************************************************ */
    // does a test file associated exist?
    if (input_train_name.compareTo(input_test_name) != 0) {
      try {

        // Load in memory a dataset that contains a classification problem
        IStest.readSet(input_test_name, false);
        int in = 0;
        int out = 0;

        ndatos = IStest.getNumInstances();
        nvariables = Attributes.getNumAttributes();
        nentradas = Attributes.getInputNumAttributes();
        nsalidas = Attributes.getOutputNumAttributes();

        for (int i = 0; i < ndatos; i++) {
          Instance inst = IStest.getInstance(i);

          in = 0;
          out = 0;

          for (int j = 0; j < nvariables; j++) {
            Attribute a = Attributes.getAttribute(j);

            direccion = a.getDirectionAttribute();
            tipo = a.getType();

            if (direccion == Attribute.INPUT) {
              if (tipo != Attribute.NOMINAL && !inst.getInputMissingValues(in)) {
                X[i][j] = new String(String.valueOf(inst.getInputRealValues(in)));
              } else {
                if (!inst.getInputMissingValues(in)) X[i][j] = inst.getInputNominalValues(in);
                else {
                  actual = kmeans.getClusterOf(i);
                  X[i][j] = new String(kmeans.valueAt(actual, j));
                }
              }
              in++;
            } else {
              if (direccion == Attribute.OUTPUT) {
                if (tipo != Attribute.NOMINAL && !inst.getOutputMissingValues(out)) {
                  X[i][j] = new String(String.valueOf(inst.getOutputRealValues(out)));
                } else {
                  if (!inst.getOutputMissingValues(out)) X[i][j] = inst.getOutputNominalValues(out);
                  else {
                    actual = kmeans.getClusterOf(i);
                    X[i][j] = new String(kmeans.valueAt(actual, j));
                  }
                }
                out++;
              }
            }
          }
        }
      } catch (Exception e) {
        System.out.println("Dataset exception = " + e);
        e.printStackTrace();
        System.exit(-1);
      }
      write_results(output_test_name);
    }
  }
  /** Process the training and test files provided in the parameters file to the constructor. */
  public void process() {
    double[] outputs;
    double[] outputs2;
    try {
      FileWriter file_write = new FileWriter(output_train_name);

      try {

        // Load in memory a dataset that contains a classification problem
        IS.readSet(input_train_name, true);
        int in = 0;
        int out = 0;
        int in2 = 0;
        int out2 = 0;
        int lastMissing = -1;
        boolean fin = false;
        boolean stepNext = false;

        ndatos = IS.getNumInstances();
        nvariables = Attributes.getNumAttributes();
        nentradas = Attributes.getInputNumAttributes();
        nsalidas = Attributes.getOutputNumAttributes();

        String[] row = null;
        X = new Vector[ndatos]; // matrix with transformed data
        for (int i = 0; i < ndatos; i++) X[i] = new Vector();

        timesSeen = new FreqList[nvariables];
        mostCommon = new String[nvariables];

        file_write.write(IS.getHeader());

        // now, print the normalized data
        file_write.write("@data\n");

        // now, search for missed data, and replace them with
        // the most common value

        for (int i = 0; i < ndatos; i++) {
          Instance inst = IS.getInstance(i);
          in = 0;
          out = 0;
          row = new String[nvariables];

          for (int j = 0; j < nvariables; j++) {
            Attribute a = Attributes.getAttribute(j);

            direccion = a.getDirectionAttribute();
            tipo = a.getType();

            if (direccion == Attribute.INPUT) {
              if (tipo != Attribute.NOMINAL && !inst.existsAnyMissingValue()) {
                row[j] = new String(String.valueOf(inst.getInputRealValues(in)));
              } else {
                if (!inst.existsAnyMissingValue()) row[j] = inst.getInputNominalValues(in);
                else {
                  // missing data
                  outputs = inst.getAllOutputValues();
                  in2 = 0;
                  out2 = 0;
                  for (int attr = 0; attr < nvariables; attr++) {
                    Attribute b = Attributes.getAttribute(attr);
                    direccion = b.getDirectionAttribute();
                    tipo = b.getType();
                    if (direccion == Attribute.INPUT) {
                      if (tipo != Attribute.NOMINAL && !inst.getInputMissingValues(in2)) {
                        row[attr] = new String(String.valueOf(inst.getInputRealValues(in2)));
                      } else {
                        if (!inst.getInputMissingValues(in2))
                          row[attr] = inst.getInputNominalValues(in2);
                      }
                      in2++;
                    } else {
                      if (direccion == Attribute.OUTPUT) {
                        if (tipo != Attribute.NOMINAL && !inst.getOutputMissingValues(out2)) {
                          row[attr] = new String(String.valueOf(inst.getOutputRealValues(out2)));
                        } else {
                          if (!inst.getOutputMissingValues(out2))
                            row[attr] = inst.getOutputNominalValues(out2);
                        }
                        out2++;
                      }
                    }
                  }
                  // make frecuencies  for each attribute
                  for (int attr = 0; attr < nvariables; attr++) {
                    Attribute b = Attributes.getAttribute(attr);

                    direccion = b.getDirectionAttribute();
                    tipo = b.getType();
                    if (direccion == Attribute.INPUT && inst.getInputMissingValues(attr)) {
                      lastMissing = attr;
                      timesSeen[attr] = new FreqList();
                      for (int m = 0; m < ndatos; m++) {
                        Instance inst2 = IS.getInstance(m);
                        outputs2 = inst2.getAllOutputValues();
                        boolean sameClass = true;
                        // are they same concept instances??
                        for (int k = 0; k < nsalidas && sameClass; k++)
                          if (outputs[k] != outputs2[k]) sameClass = false;
                        if (sameClass) {
                          if (tipo != Attribute.NOMINAL && !inst2.getInputMissingValues(attr)) {
                            timesSeen[attr].AddElement(
                                new String(String.valueOf(inst2.getInputRealValues(attr))));

                          } else {
                            if (!inst2.getInputMissingValues(attr)) {
                              timesSeen[attr].AddElement(inst2.getInputNominalValues(attr));
                            }
                          }
                        }
                      }
                    }
                  }
                  for (int attr = 0; attr < nvariables; attr++) {
                    if (direccion == Attribute.INPUT && inst.getInputMissingValues(attr)) {
                      timesSeen[attr].reset();
                    }
                  }
                  fin = false;
                  stepNext = false;
                  while (!fin) {
                    in2 = 0;
                    for (int attr = 0; attr < nvariables && !fin; attr++) {
                      Attribute b = Attributes.getAttribute(attr);

                      direccion = b.getDirectionAttribute();
                      tipo = b.getType();
                      if (direccion == Attribute.INPUT && inst.getInputMissingValues(in2)) {
                        if (stepNext) {
                          timesSeen[attr].iterate();
                          stepNext = false;
                        }
                        if (timesSeen[attr].outOfBounds()) {
                          stepNext = true;
                          if (attr == lastMissing) fin = true;
                          timesSeen[attr].reset();
                        }
                        if (!fin)
                          row[attr] =
                              ((ValueFreq) timesSeen[attr].getCurrent())
                                  .getValue(); // replace missing data
                      }
                      in2++;
                    }
                    if (!fin) {
                      stepNext = true;
                      file_write.write(row[0]);
                      for (int y = 1; y < nvariables; y++) {
                        file_write.write("," + row[y]);
                      }
                      file_write.write("\n");
                      // X[i].addElement(row);
                      // row = (String[])row.clone();
                    }
                  }
                }
              }
              in++;
            } else {
              if (direccion == Attribute.OUTPUT) {
                if (tipo != Attribute.NOMINAL && !inst.getOutputMissingValues(out)) {
                  row[j] = new String(String.valueOf(inst.getOutputRealValues(out)));
                } else {
                  if (!inst.getOutputMissingValues(out)) row[j] = inst.getOutputNominalValues(out);
                  else row[j] = new String("?");
                }
                out++;
              }
            }
          }
          if (!inst.existsAnyMissingValue()) {
            file_write.write(row[0]);
            for (int y = 1; y < nvariables; y++) {
              file_write.write("," + row[y]);
            }
            file_write.write("\n");
          }
        }
      } catch (Exception e) {
        System.out.println("Dataset exception = " + e);
        e.printStackTrace();
        System.exit(-1);
      }
      file_write.close();
    } catch (IOException e) {
      System.out.println("IO exception = " + e);
      e.printStackTrace();
      System.exit(-1);
    }

    /** ************************************************************************************ */
    // does a test file associated exist?
    if (input_train_name.compareTo(input_test_name) != 0) {
      try {
        FileWriter file_write = new FileWriter(output_test_name);

        try {

          // Load in memory a dataset that contains a classification problem
          IS.readSet(input_test_name, false);
          int in = 0;
          int out = 0;
          int in2 = 0;
          int out2 = 0;
          int lastMissing = -1;
          boolean fin = false;
          boolean stepNext = false;

          ndatos = IS.getNumInstances();
          nvariables = Attributes.getNumAttributes();
          nentradas = Attributes.getInputNumAttributes();
          nsalidas = Attributes.getOutputNumAttributes();

          String[] row = null;
          X = new Vector[ndatos]; // matrix with transformed data
          for (int i = 0; i < ndatos; i++) X[i] = new Vector();

          timesSeen = new FreqList[nvariables];
          mostCommon = new String[nvariables];

          file_write.write(IS.getHeader());

          // now, print the normalized data
          file_write.write("@data\n");

          // now, search for missed data, and replace them with
          // the most common value

          for (int i = 0; i < ndatos; i++) {
            Instance inst = IS.getInstance(i);
            in = 0;
            out = 0;
            row = new String[nvariables];

            for (int j = 0; j < nvariables; j++) {
              Attribute a = Attributes.getAttribute(j);

              direccion = a.getDirectionAttribute();
              tipo = a.getType();

              if (direccion == Attribute.INPUT) {
                if (tipo != Attribute.NOMINAL && !inst.existsAnyMissingValue()) {
                  row[j] = new String(String.valueOf(inst.getInputRealValues(in)));
                } else {
                  if (!inst.existsAnyMissingValue()) row[j] = inst.getInputNominalValues(in);
                  else {
                    // missing data
                    outputs = inst.getAllOutputValues();
                    in2 = 0;
                    out2 = 0;
                    for (int attr = 0; attr < nvariables; attr++) {
                      Attribute b = Attributes.getAttribute(attr);
                      direccion = b.getDirectionAttribute();
                      tipo = b.getType();
                      if (direccion == Attribute.INPUT) {
                        if (tipo != Attribute.NOMINAL && !inst.getInputMissingValues(in2)) {
                          row[attr] = new String(String.valueOf(inst.getInputRealValues(in2)));
                        } else {
                          if (!inst.getInputMissingValues(in2))
                            row[attr] = inst.getInputNominalValues(in2);
                        }
                        in2++;
                      } else {
                        if (direccion == Attribute.OUTPUT) {
                          if (tipo != Attribute.NOMINAL && !inst.getOutputMissingValues(out2)) {
                            row[attr] = new String(String.valueOf(inst.getOutputRealValues(out2)));
                          } else {
                            if (!inst.getOutputMissingValues(out2))
                              row[attr] = inst.getOutputNominalValues(out2);
                          }
                          out2++;
                        }
                      }
                    }
                    // make frecuencies  for each attribute
                    for (int attr = 0; attr < nvariables; attr++) {
                      Attribute b = Attributes.getAttribute(attr);

                      direccion = b.getDirectionAttribute();
                      tipo = b.getType();
                      if (direccion == Attribute.INPUT && inst.getInputMissingValues(attr)) {
                        lastMissing = attr;
                        timesSeen[attr] = new FreqList();
                        for (int m = 0; m < ndatos; m++) {
                          Instance inst2 = IS.getInstance(m);
                          outputs2 = inst2.getAllOutputValues();
                          boolean sameClass = true;
                          // are they same concept instances??
                          for (int k = 0; k < nsalidas && sameClass; k++)
                            if (outputs[k] != outputs2[k]) sameClass = false;
                          if (sameClass) {
                            if (tipo != Attribute.NOMINAL && !inst2.getInputMissingValues(attr)) {
                              timesSeen[attr].AddElement(
                                  new String(String.valueOf(inst2.getInputRealValues(attr))));

                            } else {
                              if (!inst2.getInputMissingValues(attr)) {
                                timesSeen[attr].AddElement(inst2.getInputNominalValues(attr));
                              }
                            }
                          }
                        }
                      }
                    }
                    for (int attr = 0; attr < nvariables; attr++) {
                      if (direccion == Attribute.INPUT && inst.getInputMissingValues(attr)) {
                        timesSeen[attr].reset();
                      }
                    }
                    fin = false;
                    stepNext = false;
                    while (!fin) {
                      in2 = 0;
                      for (int attr = 0; attr < nvariables && !fin; attr++) {
                        Attribute b = Attributes.getAttribute(attr);

                        direccion = b.getDirectionAttribute();
                        tipo = b.getType();
                        if (direccion == Attribute.INPUT && inst.getInputMissingValues(in2)) {
                          if (stepNext) {
                            timesSeen[attr].iterate();
                            stepNext = false;
                          }
                          if (timesSeen[attr].outOfBounds()) {
                            stepNext = true;
                            if (attr == lastMissing) fin = true;
                            timesSeen[attr].reset();
                          }
                          if (!fin)
                            row[attr] =
                                ((ValueFreq) timesSeen[attr].getCurrent())
                                    .getValue(); // replace missing data
                        }
                        in2++;
                      }
                      if (!fin) {
                        stepNext = true;
                        file_write.write(row[0]);
                        for (int y = 1; y < nvariables; y++) {
                          file_write.write("," + row[y]);
                        }
                        file_write.write("\n");
                        // X[i].addElement(row);
                        // row = (String[])row.clone();
                      }
                    }
                  }
                }
                in++;
              } else {
                if (direccion == Attribute.OUTPUT) {
                  if (tipo != Attribute.NOMINAL && !inst.getOutputMissingValues(out)) {
                    row[j] = new String(String.valueOf(inst.getOutputRealValues(out)));
                  } else {
                    if (!inst.getOutputMissingValues(out))
                      row[j] = inst.getOutputNominalValues(out);
                    else row[j] = new String("?");
                  }
                  out++;
                }
              }
            }
            if (!inst.existsAnyMissingValue()) {
              file_write.write(row[0]);
              for (int y = 1; y < nvariables; y++) {
                file_write.write("," + row[y]);
              }
              file_write.write("\n");
            }
          }
        } catch (Exception e) {
          System.out.println("Dataset exception = " + e);
          e.printStackTrace();
          System.exit(-1);
        }
        file_write.close();
      } catch (IOException e) {
        System.out.println("IO exception = " + e);
        e.printStackTrace();
        System.exit(-1);
      }
    }
  }