/** * Computes the distance between two instances (without previous normalization) * * @param i First instance * @param j Second instance * @return The Euclidean distance between i and j */ private double distance(Instance i, Instance j) { double dist = 0; int in = 0; int out = 0; for (int l = 0; l < nvariables; l++) { Attribute a = Attributes.getAttribute(l); direccion = a.getDirectionAttribute(); tipo = a.getType(); if (direccion == Attribute.INPUT) { if (tipo != Attribute.NOMINAL && !i.getInputMissingValues(in)) { // real value, apply euclidean distance dist += (i.getInputRealValues(in) - j.getInputRealValues(in)) * (i.getInputRealValues(in) - j.getInputRealValues(in)); } else { if (!i.getInputMissingValues(in) && i.getInputNominalValues(in) != j.getInputNominalValues(in)) dist += 1; } in++; } else { if (direccion == Attribute.OUTPUT) { if (tipo != Attribute.NOMINAL && !i.getOutputMissingValues(out)) { dist += (i.getOutputRealValues(out) - j.getOutputRealValues(out)) * (i.getOutputRealValues(out) - j.getOutputRealValues(out)); } else { if (!i.getOutputMissingValues(out) && i.getOutputNominalValues(out) != j.getOutputNominalValues(out)) dist += 1; } out++; } } } return dist; }
/** Process the training and test files provided in the parameters file to the constructor. */ public void process() { // declarations double[] outputs; double[] outputs2; Instance neighbor; double dist, mean; int actual; Randomize rnd = new Randomize(); Instance ex; gCenter kmeans = null; int iterations = 0; double E; double prevE; int totalMissing = 0; boolean allMissing = true; rnd.setSeed(semilla); // PROCESS try { // Load in memory a dataset that contains a classification problem IS.readSet(input_train_name, true); int in = 0; int out = 0; ndatos = IS.getNumInstances(); nvariables = Attributes.getNumAttributes(); nentradas = Attributes.getInputNumAttributes(); nsalidas = Attributes.getOutputNumAttributes(); X = new String[ndatos][nvariables]; // matrix with transformed data kmeans = new gCenter(K, ndatos, nvariables); timesSeen = new FreqList[nvariables]; mostCommon = new String[nvariables]; // first, we choose k 'means' randomly from all // instances totalMissing = 0; for (int i = 0; i < ndatos; i++) { Instance inst = IS.getInstance(i); if (inst.existsAnyMissingValue()) totalMissing++; } if (totalMissing == ndatos) allMissing = true; else allMissing = false; for (int numMeans = 0; numMeans < K; numMeans++) { do { actual = (int) (ndatos * rnd.Rand()); ex = IS.getInstance(actual); } while (ex.existsAnyMissingValue() && !allMissing); kmeans.copyCenter(ex, numMeans); } // now, iterate adjusting clusters' centers and // instances to them prevE = 0; iterations = 0; do { for (int i = 0; i < ndatos; i++) { Instance inst = IS.getInstance(i); kmeans.setClusterOf(inst, i); } // set new centers kmeans.recalculateCenters(IS); // compute RMSE E = 0; for (int i = 0; i < ndatos; i++) { Instance inst = IS.getInstance(i); E += kmeans.distance(inst, kmeans.getClusterOf(i)); } iterations++; // System.out.println(iterations+"\t"+E); if (Math.abs(prevE - E) == 0) iterations = maxIter; else prevE = E; } while (E > minError && iterations < maxIter); for (int i = 0; i < ndatos; i++) { Instance inst = IS.getInstance(i); in = 0; out = 0; for (int j = 0; j < nvariables; j++) { Attribute a = Attributes.getAttribute(j); direccion = a.getDirectionAttribute(); tipo = a.getType(); if (direccion == Attribute.INPUT) { if (tipo != Attribute.NOMINAL && !inst.getInputMissingValues(in)) { X[i][j] = new String(String.valueOf(inst.getInputRealValues(in))); } else { if (!inst.getInputMissingValues(in)) X[i][j] = inst.getInputNominalValues(in); else { actual = kmeans.getClusterOf(i); X[i][j] = new String(kmeans.valueAt(actual, j)); } } in++; } else { if (direccion == Attribute.OUTPUT) { if (tipo != Attribute.NOMINAL && !inst.getOutputMissingValues(out)) { X[i][j] = new String(String.valueOf(inst.getOutputRealValues(out))); } else { if (!inst.getOutputMissingValues(out)) X[i][j] = inst.getOutputNominalValues(out); else { actual = kmeans.getClusterOf(i); X[i][j] = new String(kmeans.valueAt(actual, j)); } } out++; } } } } } catch (Exception e) { System.out.println("Dataset exception = " + e); e.printStackTrace(); System.exit(-1); } write_results(output_train_name); /** ************************************************************************************ */ // does a test file associated exist? if (input_train_name.compareTo(input_test_name) != 0) { try { // Load in memory a dataset that contains a classification problem IStest.readSet(input_test_name, false); int in = 0; int out = 0; ndatos = IStest.getNumInstances(); nvariables = Attributes.getNumAttributes(); nentradas = Attributes.getInputNumAttributes(); nsalidas = Attributes.getOutputNumAttributes(); for (int i = 0; i < ndatos; i++) { Instance inst = IStest.getInstance(i); in = 0; out = 0; for (int j = 0; j < nvariables; j++) { Attribute a = Attributes.getAttribute(j); direccion = a.getDirectionAttribute(); tipo = a.getType(); if (direccion == Attribute.INPUT) { if (tipo != Attribute.NOMINAL && !inst.getInputMissingValues(in)) { X[i][j] = new String(String.valueOf(inst.getInputRealValues(in))); } else { if (!inst.getInputMissingValues(in)) X[i][j] = inst.getInputNominalValues(in); else { actual = kmeans.getClusterOf(i); X[i][j] = new String(kmeans.valueAt(actual, j)); } } in++; } else { if (direccion == Attribute.OUTPUT) { if (tipo != Attribute.NOMINAL && !inst.getOutputMissingValues(out)) { X[i][j] = new String(String.valueOf(inst.getOutputRealValues(out))); } else { if (!inst.getOutputMissingValues(out)) X[i][j] = inst.getOutputNominalValues(out); else { actual = kmeans.getClusterOf(i); X[i][j] = new String(kmeans.valueAt(actual, j)); } } out++; } } } } } catch (Exception e) { System.out.println("Dataset exception = " + e); e.printStackTrace(); System.exit(-1); } write_results(output_test_name); } }
/** Process the training and test files provided in the parameters file to the constructor. */ public void process() { double[] outputs; double[] outputs2; try { FileWriter file_write = new FileWriter(output_train_name); try { // Load in memory a dataset that contains a classification problem IS.readSet(input_train_name, true); int in = 0; int out = 0; int in2 = 0; int out2 = 0; int lastMissing = -1; boolean fin = false; boolean stepNext = false; ndatos = IS.getNumInstances(); nvariables = Attributes.getNumAttributes(); nentradas = Attributes.getInputNumAttributes(); nsalidas = Attributes.getOutputNumAttributes(); String[] row = null; X = new Vector[ndatos]; // matrix with transformed data for (int i = 0; i < ndatos; i++) X[i] = new Vector(); timesSeen = new FreqList[nvariables]; mostCommon = new String[nvariables]; file_write.write(IS.getHeader()); // now, print the normalized data file_write.write("@data\n"); // now, search for missed data, and replace them with // the most common value for (int i = 0; i < ndatos; i++) { Instance inst = IS.getInstance(i); in = 0; out = 0; row = new String[nvariables]; for (int j = 0; j < nvariables; j++) { Attribute a = Attributes.getAttribute(j); direccion = a.getDirectionAttribute(); tipo = a.getType(); if (direccion == Attribute.INPUT) { if (tipo != Attribute.NOMINAL && !inst.existsAnyMissingValue()) { row[j] = new String(String.valueOf(inst.getInputRealValues(in))); } else { if (!inst.existsAnyMissingValue()) row[j] = inst.getInputNominalValues(in); else { // missing data outputs = inst.getAllOutputValues(); in2 = 0; out2 = 0; for (int attr = 0; attr < nvariables; attr++) { Attribute b = Attributes.getAttribute(attr); direccion = b.getDirectionAttribute(); tipo = b.getType(); if (direccion == Attribute.INPUT) { if (tipo != Attribute.NOMINAL && !inst.getInputMissingValues(in2)) { row[attr] = new String(String.valueOf(inst.getInputRealValues(in2))); } else { if (!inst.getInputMissingValues(in2)) row[attr] = inst.getInputNominalValues(in2); } in2++; } else { if (direccion == Attribute.OUTPUT) { if (tipo != Attribute.NOMINAL && !inst.getOutputMissingValues(out2)) { row[attr] = new String(String.valueOf(inst.getOutputRealValues(out2))); } else { if (!inst.getOutputMissingValues(out2)) row[attr] = inst.getOutputNominalValues(out2); } out2++; } } } // make frecuencies for each attribute for (int attr = 0; attr < nvariables; attr++) { Attribute b = Attributes.getAttribute(attr); direccion = b.getDirectionAttribute(); tipo = b.getType(); if (direccion == Attribute.INPUT && inst.getInputMissingValues(attr)) { lastMissing = attr; timesSeen[attr] = new FreqList(); for (int m = 0; m < ndatos; m++) { Instance inst2 = IS.getInstance(m); outputs2 = inst2.getAllOutputValues(); boolean sameClass = true; // are they same concept instances?? for (int k = 0; k < nsalidas && sameClass; k++) if (outputs[k] != outputs2[k]) sameClass = false; if (sameClass) { if (tipo != Attribute.NOMINAL && !inst2.getInputMissingValues(attr)) { timesSeen[attr].AddElement( new String(String.valueOf(inst2.getInputRealValues(attr)))); } else { if (!inst2.getInputMissingValues(attr)) { timesSeen[attr].AddElement(inst2.getInputNominalValues(attr)); } } } } } } for (int attr = 0; attr < nvariables; attr++) { if (direccion == Attribute.INPUT && inst.getInputMissingValues(attr)) { timesSeen[attr].reset(); } } fin = false; stepNext = false; while (!fin) { in2 = 0; for (int attr = 0; attr < nvariables && !fin; attr++) { Attribute b = Attributes.getAttribute(attr); direccion = b.getDirectionAttribute(); tipo = b.getType(); if (direccion == Attribute.INPUT && inst.getInputMissingValues(in2)) { if (stepNext) { timesSeen[attr].iterate(); stepNext = false; } if (timesSeen[attr].outOfBounds()) { stepNext = true; if (attr == lastMissing) fin = true; timesSeen[attr].reset(); } if (!fin) row[attr] = ((ValueFreq) timesSeen[attr].getCurrent()) .getValue(); // replace missing data } in2++; } if (!fin) { stepNext = true; file_write.write(row[0]); for (int y = 1; y < nvariables; y++) { file_write.write("," + row[y]); } file_write.write("\n"); // X[i].addElement(row); // row = (String[])row.clone(); } } } } in++; } else { if (direccion == Attribute.OUTPUT) { if (tipo != Attribute.NOMINAL && !inst.getOutputMissingValues(out)) { row[j] = new String(String.valueOf(inst.getOutputRealValues(out))); } else { if (!inst.getOutputMissingValues(out)) row[j] = inst.getOutputNominalValues(out); else row[j] = new String("?"); } out++; } } } if (!inst.existsAnyMissingValue()) { file_write.write(row[0]); for (int y = 1; y < nvariables; y++) { file_write.write("," + row[y]); } file_write.write("\n"); } } } catch (Exception e) { System.out.println("Dataset exception = " + e); e.printStackTrace(); System.exit(-1); } file_write.close(); } catch (IOException e) { System.out.println("IO exception = " + e); e.printStackTrace(); System.exit(-1); } /** ************************************************************************************ */ // does a test file associated exist? if (input_train_name.compareTo(input_test_name) != 0) { try { FileWriter file_write = new FileWriter(output_test_name); try { // Load in memory a dataset that contains a classification problem IS.readSet(input_test_name, false); int in = 0; int out = 0; int in2 = 0; int out2 = 0; int lastMissing = -1; boolean fin = false; boolean stepNext = false; ndatos = IS.getNumInstances(); nvariables = Attributes.getNumAttributes(); nentradas = Attributes.getInputNumAttributes(); nsalidas = Attributes.getOutputNumAttributes(); String[] row = null; X = new Vector[ndatos]; // matrix with transformed data for (int i = 0; i < ndatos; i++) X[i] = new Vector(); timesSeen = new FreqList[nvariables]; mostCommon = new String[nvariables]; file_write.write(IS.getHeader()); // now, print the normalized data file_write.write("@data\n"); // now, search for missed data, and replace them with // the most common value for (int i = 0; i < ndatos; i++) { Instance inst = IS.getInstance(i); in = 0; out = 0; row = new String[nvariables]; for (int j = 0; j < nvariables; j++) { Attribute a = Attributes.getAttribute(j); direccion = a.getDirectionAttribute(); tipo = a.getType(); if (direccion == Attribute.INPUT) { if (tipo != Attribute.NOMINAL && !inst.existsAnyMissingValue()) { row[j] = new String(String.valueOf(inst.getInputRealValues(in))); } else { if (!inst.existsAnyMissingValue()) row[j] = inst.getInputNominalValues(in); else { // missing data outputs = inst.getAllOutputValues(); in2 = 0; out2 = 0; for (int attr = 0; attr < nvariables; attr++) { Attribute b = Attributes.getAttribute(attr); direccion = b.getDirectionAttribute(); tipo = b.getType(); if (direccion == Attribute.INPUT) { if (tipo != Attribute.NOMINAL && !inst.getInputMissingValues(in2)) { row[attr] = new String(String.valueOf(inst.getInputRealValues(in2))); } else { if (!inst.getInputMissingValues(in2)) row[attr] = inst.getInputNominalValues(in2); } in2++; } else { if (direccion == Attribute.OUTPUT) { if (tipo != Attribute.NOMINAL && !inst.getOutputMissingValues(out2)) { row[attr] = new String(String.valueOf(inst.getOutputRealValues(out2))); } else { if (!inst.getOutputMissingValues(out2)) row[attr] = inst.getOutputNominalValues(out2); } out2++; } } } // make frecuencies for each attribute for (int attr = 0; attr < nvariables; attr++) { Attribute b = Attributes.getAttribute(attr); direccion = b.getDirectionAttribute(); tipo = b.getType(); if (direccion == Attribute.INPUT && inst.getInputMissingValues(attr)) { lastMissing = attr; timesSeen[attr] = new FreqList(); for (int m = 0; m < ndatos; m++) { Instance inst2 = IS.getInstance(m); outputs2 = inst2.getAllOutputValues(); boolean sameClass = true; // are they same concept instances?? for (int k = 0; k < nsalidas && sameClass; k++) if (outputs[k] != outputs2[k]) sameClass = false; if (sameClass) { if (tipo != Attribute.NOMINAL && !inst2.getInputMissingValues(attr)) { timesSeen[attr].AddElement( new String(String.valueOf(inst2.getInputRealValues(attr)))); } else { if (!inst2.getInputMissingValues(attr)) { timesSeen[attr].AddElement(inst2.getInputNominalValues(attr)); } } } } } } for (int attr = 0; attr < nvariables; attr++) { if (direccion == Attribute.INPUT && inst.getInputMissingValues(attr)) { timesSeen[attr].reset(); } } fin = false; stepNext = false; while (!fin) { in2 = 0; for (int attr = 0; attr < nvariables && !fin; attr++) { Attribute b = Attributes.getAttribute(attr); direccion = b.getDirectionAttribute(); tipo = b.getType(); if (direccion == Attribute.INPUT && inst.getInputMissingValues(in2)) { if (stepNext) { timesSeen[attr].iterate(); stepNext = false; } if (timesSeen[attr].outOfBounds()) { stepNext = true; if (attr == lastMissing) fin = true; timesSeen[attr].reset(); } if (!fin) row[attr] = ((ValueFreq) timesSeen[attr].getCurrent()) .getValue(); // replace missing data } in2++; } if (!fin) { stepNext = true; file_write.write(row[0]); for (int y = 1; y < nvariables; y++) { file_write.write("," + row[y]); } file_write.write("\n"); // X[i].addElement(row); // row = (String[])row.clone(); } } } } in++; } else { if (direccion == Attribute.OUTPUT) { if (tipo != Attribute.NOMINAL && !inst.getOutputMissingValues(out)) { row[j] = new String(String.valueOf(inst.getOutputRealValues(out))); } else { if (!inst.getOutputMissingValues(out)) row[j] = inst.getOutputNominalValues(out); else row[j] = new String("?"); } out++; } } } if (!inst.existsAnyMissingValue()) { file_write.write(row[0]); for (int y = 1; y < nvariables; y++) { file_write.write("," + row[y]); } file_write.write("\n"); } } } catch (Exception e) { System.out.println("Dataset exception = " + e); e.printStackTrace(); System.exit(-1); } file_write.close(); } catch (IOException e) { System.out.println("IO exception = " + e); e.printStackTrace(); System.exit(-1); } } }