Ejemplo n.º 1
0
  /** This is (mostly) copied from CRF4.java */
  public boolean[][] labelConnectionsIn(
      Alphabet outputAlphabet, InstanceList trainingSet, String start) {
    int numLabels = outputAlphabet.size();
    boolean[][] connections = new boolean[numLabels][numLabels];
    for (int i = 0; i < trainingSet.size(); i++) {
      Instance instance = trainingSet.getInstance(i);
      FeatureSequence output = (FeatureSequence) instance.getTarget();
      for (int j = 1; j < output.size(); j++) {
        int sourceIndex = outputAlphabet.lookupIndex(output.get(j - 1));
        int destIndex = outputAlphabet.lookupIndex(output.get(j));
        assert (sourceIndex >= 0 && destIndex >= 0);
        connections[sourceIndex][destIndex] = true;
      }
    }

    // Handle start state
    if (start != null) {
      int startIndex = outputAlphabet.lookupIndex(start);
      for (int j = 0; j < outputAlphabet.size(); j++) {
        connections[startIndex][j] = true;
      }
    }

    return connections;
  }
  public int accept(int ins_num, int income_seq, String content) throws RemoteException {
    if (!work) return -1;

    Instance res = instances.get(ins_num);

    if (res == null) return -1;
    else {

      if (income_seq >= res.n_p) {

        res.n_p = income_seq;

        res.n_a = income_seq;

        res.content = content;

        res.decided = true;

        INS_NUM++;

        return income_seq;
      }
    }

    return -1;
  }
Ejemplo n.º 3
0
  public void count() {

    TIntIntHashMap docCounts = new TIntIntHashMap();

    int index = 0;

    if (instances.size() == 0) {
      logger.info("Instance list is empty");
      return;
    }

    if (instances.get(0).getData() instanceof FeatureSequence) {

      for (Instance instance : instances) {
        FeatureSequence features = (FeatureSequence) instance.getData();

        for (int i = 0; i < features.getLength(); i++) {
          docCounts.adjustOrPutValue(features.getIndexAtPosition(i), 1, 1);
        }

        int[] keys = docCounts.keys();
        for (int i = 0; i < keys.length - 1; i++) {
          int feature = keys[i];
          featureCounts[feature] += docCounts.get(feature);
          documentFrequencies[feature]++;
        }

        docCounts = new TIntIntHashMap();

        index++;
        if (index % 1000 == 0) {
          System.err.println(index);
        }
      }
    } else if (instances.get(0).getData() instanceof FeatureVector) {

      for (Instance instance : instances) {
        FeatureVector features = (FeatureVector) instance.getData();

        for (int location = 0; location < features.numLocations(); location++) {
          int feature = features.indexAtLocation(location);
          double value = features.valueAtLocation(location);

          documentFrequencies[feature]++;
          featureCounts[feature] += value;
        }

        index++;
        if (index % 1000 == 0) {
          System.err.println(index);
        }
      }
    } else {
      logger.info("Unsupported data class: " + instances.get(0).getData().getClass().getName());
    }
  }
Ejemplo n.º 4
0
 public String toString() {
   StringBuffer buf = new StringBuffer();
   for (int i = 0; i < numSources(); i++) {
     String src = getSource(i);
     for (int j = 0; j < numInstances(src); j++) {
       Instance inst = getInstance(src, j);
       buf.append(inst.toString() + "\n");
     }
   }
   return buf.toString();
 }
  public void dump() throws RemoteException {
    int num_ins = instances.size();

    System.out.println(getServerName() + ":");

    for (int i = 1; i <= num_ins; i++) {
      Instance tmp = instances.get(i);

      System.out.println(i + ": " + tmp.toString());
    }

    return;
  }
Ejemplo n.º 6
0
  /**
   * Process a dataset for classification
   *
   * @return Nothing
   */
  public void processClassifierDataset() throws IOException {
    try {

      ndata = IS.getNumInstances();
      ninputs = Attributes.getInputNumAttributes();
      nvariables = ninputs + Attributes.getOutputNumAttributes();
      // Check that there is only one output variable and
      // it is nominal
      if (Attributes.getOutputNumAttributes() > 1) {
        System.out.println("This algorithm can not process MIMO datasets");
        System.out.println("All outputs but the first one will be removed");
      }
      boolean noOutputs = false;
      if (Attributes.getOutputNumAttributes() < 1) {
        System.out.println("This algorithm cannot process datasets without outputs");
        System.out.println("Zero-valued output generated");
        noOutputs = true;
      }
      // Initialice and fill our own tables
      X = new double[ndata][ninputs];
      missing = new boolean[ndata][ninputs];
      C = new int[ndata];
      // Maximum and minimum of inputs
      imax = new double[ninputs];
      imin = new double[ninputs];
      // Maximum and minimum for output data
      omax = 0;
      omin = 0;
      // All values are casted into double/integer
      nclasses = 0;
      for (int i = 0; i < X.length; i++) {
        Instance inst = IS.getInstance(i);
        for (int j = 0; j < ninputs; j++) {
          X[i][j] = IS.getInputNumericValue(i, j);
          missing[i][j] = inst.getInputMissingValues(j);
          if (X[i][j] > imax[j] || i == 0) imax[j] = X[i][j];
          if (X[i][j] < imin[j] || i == 0) imin[j] = X[i][j];
        }
        if (noOutputs) C[i] = 0;
        else C[i] = (int) IS.getOutputNumericValue(i, 0);
        if (C[i] > nclasses) nclasses = C[i];
      }
      nclasses++;
      System.out.println("Number of classes=" + nclasses);
    } catch (Exception e) {
      System.out.println("DBG: Exception in readSet");
      e.printStackTrace();
    }
  }
Ejemplo n.º 7
0
  /**
   * Process a dataset file for a clustering problem.
   *
   * @param nfexamples Name of the dataset file
   * @param train The dataset file is for training or for test
   * @throws java.io.IOException if there is any semantical, lexical or sintactical error in the
   *     input file.
   */
  public void processClusterDataset(String nfexamples, boolean train) throws IOException {

    try {

      // Load in memory a dataset that contains a classification problem
      IS.readSet(nfexamples, train);

      nData = IS.getNumInstances();
      nInputs = Attributes.getInputNumAttributes();
      nVariables = nInputs + Attributes.getOutputNumAttributes();

      if (Attributes.getOutputNumAttributes() != 0) {
        System.out.println("This algorithm can not process datasets with outputs");
        System.out.println("All outputs will be removed");
      }

      // Initialize and fill our own tables
      X = new double[nData][nInputs];
      missing = new boolean[nData][nInputs];

      // Maximum and minimum of inputs
      iMaximum = new double[nInputs];
      iMinimum = new double[nInputs];

      // Maximum and minimum for output data
      oMaximum = 0;
      oMinimum = 0;

      // All values are casted into double/integer
      nClasses = 0;
      for (int i = 0; i < X.length; i++) {
        Instance inst = IS.getInstance(i);
        for (int j = 0; j < nInputs; j++) {
          X[i][j] = IS.getInputNumericValue(i, j);
          missing[i][j] = inst.getInputMissingValues(j);
          if (X[i][j] > iMaximum[j] || i == 0) {
            iMaximum[j] = X[i][j];
          }
          if (X[i][j] < iMinimum[j] || i == 0) {
            iMinimum[j] = X[i][j];
          }
        }
      }

    } catch (Exception e) {
      System.out.println("DBG: Exception in readSet");
      e.printStackTrace();
    }
  }
Ejemplo n.º 8
0
  /**
   * Process a dataset for modelling
   *
   * @return Nothing
   */
  public void processModelDataset() throws IOException {
    try {

      // Load in memory a dataset that contains a classification problem
      // IS.readSet(nfejemplos,train);
      ndata = IS.getNumInstances();
      ninputs = Attributes.getInputNumAttributes();
      nvariables = ninputs + Attributes.getOutputNumAttributes();
      if (Attributes.getOutputNumAttributes() > 1) {
        System.out.println("This algorithm can not process MIMO datasets");
        System.out.println("All outputs but the first one will be removed");
      }
      boolean noOutputs = false;
      if (Attributes.getOutputNumAttributes() < 1) {
        System.out.println("This algorithm can not process datasets without outputs");
        System.out.println("Zero-valued output generated");
        noOutputs = true;
      }
      // Initialice and fill our own tables
      X = new double[ndata][ninputs];
      missing = new boolean[ndata][ninputs];
      Y = new double[ndata];
      // Maximum and minimum of inputs
      imax = new double[ninputs];
      imin = new double[ninputs];
      // Maximum and minimum for output data
      omax = 0;
      omin = 0;
      // All values are casted into double/integer
      nclasses = 0;
      for (int i = 0; i < X.length; i++) {
        Instance inst = IS.getInstance(i);
        for (int j = 0; j < ninputs; j++) {
          X[i][j] = IS.getInputNumericValue(i, j);
          missing[i][j] = inst.getInputMissingValues(j);
          if (X[i][j] > imax[j] || i == 0) imax[j] = X[i][j];
          if (X[i][j] < imin[j] || i == 0) imin[j] = X[i][j];
        }
        if (noOutputs) Y[i] = 0;
        else Y[i] = IS.getOutputNumericValue(i, 0);
        if (Y[i] > omax || i == 0) omax = Y[i];
        if (Y[i] < omin || i == 0) omin = Y[i];
      }
    } catch (Exception e) {
      System.out.println("DBG: Exception in readSet");
      e.printStackTrace();
    }
  }
  public Instance prepare(int ins_num, int income_seq) throws RemoteException {
    if (!work) return null;

    Instance res = instances.get(ins_num);

    if (res == null) return null;
    else {

      if (income_seq > res.n_p) {

        res.n_p = income_seq;

        return res;
      } else return null;
    }
  }
Ejemplo n.º 10
0
  /**
   * Calculates the distance between two instances
   *
   * @param test the first instance
   * @param train the second instance
   * @return the distance between the two given instances, between 0 and 1
   */
  protected double distance(Instance first, Instance second) {

    double distance = 0;
    int firstI, secondI;

    for (int p1 = 0, p2 = 0; p1 < first.numValues() || p2 < second.numValues(); ) {
      if (p1 >= first.numValues()) {
        firstI = m_instances.numAttributes();
      } else {
        firstI = first.index(p1);
      }
      if (p2 >= second.numValues()) {
        secondI = m_instances.numAttributes();
      } else {
        secondI = second.index(p2);
      }
      if (firstI == m_instances.classIndex()) {
        p1++;
        continue;
      }
      if (secondI == m_instances.classIndex()) {
        p2++;
        continue;
      }
      double diff;
      if (firstI == secondI) {
        diff = difference(firstI, first.valueSparse(p1), second.valueSparse(p2));
        p1++;
        p2++;
      } else if (firstI > secondI) {
        diff = difference(secondI, 0, second.valueSparse(p2));
        p2++;
      } else {
        diff = difference(firstI, first.valueSparse(p1), 0);
        p1++;
      }
      distance += diff * diff;
    }

    return Math.sqrt(distance / m_instances.numAttributes());
  }
Ejemplo n.º 11
0
  /**
   * Input an instance for filtering. Ordinarily the instance is processed and made available for
   * output immediately. Some filters require all instances be read before producing output.
   *
   * @param instance the input instance
   * @return true if the filtered instance may now be collected with output().
   * @exception IllegalStateException if no input format has been defined.
   */
  public boolean input(Instance instance) {

    if (getInputFormat() == null) {
      throw new IllegalStateException("No input instance format defined");
    }
    if (m_NewBatch) {
      resetQueue();
      m_NewBatch = false;
    }
    push((Instance) instance.copy());
    return true;
  }
Ejemplo n.º 12
0
 public Instance pipe(Instance carrier) {
   TokenSequence ts = (TokenSequence) carrier.getData();
   TokenSequence targets =
       carrier.getTarget() instanceof TokenSequence ? (TokenSequence) carrier.getTarget() : null;
   TokenSequence source =
       carrier.getSource() instanceof TokenSequence ? (TokenSequence) carrier.getSource() : null;
   StringBuffer sb = new StringBuffer();
   if (prefix != null) sb.append(prefix);
   sb.append("name: " + carrier.getName() + "\n");
   for (int i = 0; i < ts.size(); i++) {
     if (source != null) {
       sb.append(source.get(i).getText());
       sb.append(' ');
     }
     if (carrier.getTarget() instanceof TokenSequence) {
       sb.append(((TokenSequence) carrier.getTarget()).get(i).getText());
       sb.append(' ');
     }
     if (carrier.getTarget() instanceof FeatureSequence) {
       sb.append(((FeatureSequence) carrier.getTarget()).getObjectAtPosition(i).toString());
       sb.append(' ');
     }
     PropertyList pl = ts.get(i).getFeatures();
     if (pl != null) {
       PropertyList.Iterator iter = pl.iterator();
       while (iter.hasNext()) {
         iter.next();
         double v = iter.getNumericValue();
         if (v == 1.0) sb.append(iter.getKey());
         else sb.append(iter.getKey() + '=' + v);
         sb.append(' ');
       }
     }
     sb.append('\n');
   }
   System.out.print(sb.toString());
   return carrier;
 }
  public void decide(int income_seq, Operation op) throws RemoteException {

    // set corresponding Op decided, assume no collision happens

    if (!work) return;

    if (op == null) return;

    Instance ins = instances.get(op.op_num);

    if (!ins.decided) {

      ins.decided = true;

      INS_NUM++;

      ins.n_a = income_seq;

      ins.content = op.content;
    }

    return;
  }
Ejemplo n.º 14
0
 /* The main tree traversal function; if node is a leaf,
  * return classification; otherwise, determine whether
  * instance is higher or lower than this attribute-value
  * pair and returns testInstance of proper child
  */
 public boolean testInstance(Instance in) {
   if (majority) {
     return majClass;
   }
   if (leaf) {
     return classification;
   } else {
     if (in.get((int) attribute) <= value) {
       return leftChild.testInstance(in);
     } else {
       return rightChild.testInstance(in);
     }
   }
 }
Ejemplo n.º 15
0
 public Instance pipe(Instance carrier) {
   TokenSequence ts = (TokenSequence) carrier.getData();
   for (int i = 0; i < ts.size(); i++) {
     Token t = ts.get(i);
     String s = t.getText();
     if (distinguishBorders) s = startBorderChar + s + endBorderChar;
     int slen = s.length();
     for (int j = 0; j < gramSizes.length; j++) {
       int size = gramSizes[j];
       for (int k = 0; k < (slen - size) + 1; k++)
         t.setFeatureValue((prefix + s.substring(k, k + size)).intern(), 1.0);
     }
   }
   return carrier;
 }
Ejemplo n.º 16
0
 public Instance pipe(Instance carrier) {
   TokenSequence ts = (TokenSequence) carrier.getData();
   for (int i = 0; i < ts.size(); i++) {
     Token t = ts.get(i);
     String s = t.getText();
     if (distinguishBorders) s = startBorderChar + s + endBorderChar;
     int slen = s.length();
     for (int j = 0; j < gramSizes.length; j++) {
       int size = gramSizes[j];
       for (int k = 0; k < slen - size; k++)
         t.setFeatureValue(
             s.substring(k, k + size), 1.0); // original was substring(k, size), changed by Fuchun
     }
   }
   return carrier;
 }
Ejemplo n.º 17
0
  /**
   * Updates the minimum and maximum values for all the attributes based on a new instance.
   *
   * @param instance the new instance
   */
  private void updateMinMax(Instance instance) {

    for (int j = 0; j < instance.numAttributes(); j++) {
      if (Double.isNaN(m_Min[j])) {
        m_Min[j] = instance.value(j);
        m_Max[j] = instance.value(j);
      } else {
        if (instance.value(j) < m_Min[j]) {
          m_Min[j] = instance.value(j);
        } else {
          if (instance.value(j) > m_Max[j]) {
            m_Max[j] = instance.value(j);
          }
        }
      }
    }
  }
Ejemplo n.º 18
0
  /** Computes the difference between two given attribute values. */
  protected double difference(int index, double val1, double val2) {

    switch (m_instances.attribute(index).type()) {
      case Attribute.NOMINAL:

        // If attribute is nominal
        if (Instance.isMissingValue(val1)
            || Instance.isMissingValue(val2)
            || ((int) val1 != (int) val2)) {
          return 1;
        } else {
          return 0;
        }
      case Attribute.NUMERIC:

        // If attribute is numeric
        if (Instance.isMissingValue(val1) || Instance.isMissingValue(val2)) {
          if (Instance.isMissingValue(val1) && Instance.isMissingValue(val2)) {
            return 1;
          } else {
            double diff;
            if (Instance.isMissingValue(val2)) {
              diff = norm(val1, index);
            } else {
              diff = norm(val2, index);
            }
            if (diff < 0.5) {
              diff = 1.0 - diff;
            }
            return diff;
          }
        } else {
          return norm(val1, index) - norm(val2, index);
        }
      default:
        return 0;
    }
  }
Ejemplo n.º 19
0
  /** Process the training and test files provided in the parameters file to the constructor. */
  public void process() {
    try {

      // Load in memory a dataset that contains a classification problem
      IS.readSet(input_train_name, true);
      int in = 0;
      int out = 0;

      ndatos = IS.getNumInstances();
      nvariables = Attributes.getNumAttributes();
      nentradas = Attributes.getInputNumAttributes();
      nsalidas = Attributes.getOutputNumAttributes();

      X = new String[ndatos][nvariables]; // matrix with transformed data
      boolean[] isMissed =
          new boolean[ndatos]; // vector which points out instances with missed data

      try {
        FileWriter file_write = new FileWriter(output_train_name);

        file_write.write(IS.getHeader());

        // now, print the normalized data
        file_write.write("@data\n");
        // file_write.close();
        PrintWriter pw = new PrintWriter(file_write);
        for (int i = 0; i < ndatos; i++) {
          Instance inst = IS.getInstance(i);
          if (!inst.existsAnyMissingValue()) {
            inst.printAsOriginal(pw);
            // file_write.write(inst.toString()); // DOES NOT WRITE BACK NON-DEF DIRECTION
            // ATTRIBUTES!!!!
            file_write.write("\n");
          }
        }
        pw.close();
        file_write.close();
      } catch (IOException e) {
        System.out.println("IO exception = " + e);
        System.exit(-1);
      }

    } catch (Exception e) {
      System.out.println("Dataset exception = " + e);
      e.printStackTrace();
      System.exit(-1);
    }
    // does a test file associated exist?
    if (input_train_name.compareTo(input_test_name) != 0) {
      try {

        // Load in memory a dataset that contains a classification problem
        IS.readSet(input_test_name, false);
        int in = 0;
        int out = 0;

        ndatos = IS.getNumInstances();
        nvariables = Attributes.getNumAttributes();
        nentradas = Attributes.getInputNumAttributes();
        nsalidas = Attributes.getOutputNumAttributes();

        X = new String[ndatos][nvariables]; // matrix with transformed data
        boolean[] isMissed =
            new boolean[ndatos]; // vector which points out instances with missed data

        try {
          FileWriter file_write = new FileWriter(output_test_name);

          file_write.write(IS.getHeader());

          // now, print the normalized data
          file_write.write("@data\n");
          PrintWriter pw = new PrintWriter(file_write);
          for (int i = 0; i < ndatos; i++) {
            Instance inst = IS.getInstance(i);
            if (!inst.existsAnyMissingValue()) {
              inst.printAsOriginal(pw);
              file_write.write("\n");
            }
          }
          pw.close();
          file_write.close();
        } catch (IOException e) {
          System.out.println("IO exception = " + e);
          System.exit(-1);
        }

      } catch (Exception e) {
        System.out.println("Dataset exception = " + e);
        e.printStackTrace();
        System.exit(-1);
      }
    }

    // write_results(); / since there ins't any data transformation, is not needed
  }
Ejemplo n.º 20
0
  /**
   * Computes the distance between two instances (without previous normalization)
   *
   * @param i First instance
   * @param j Second instance
   * @return The Euclidean distance between i and j
   */
  private double distance(Instance i, Instance j) {
    double dist = 0;
    int in = 0;
    int out = 0;

    for (int l = 0; l < nvariables; l++) {
      Attribute a = Attributes.getAttribute(l);

      direccion = a.getDirectionAttribute();
      tipo = a.getType();

      if (direccion == Attribute.INPUT) {
        if (tipo != Attribute.NOMINAL && !i.getInputMissingValues(in)) {
          // real value, apply euclidean distance
          dist +=
              (i.getInputRealValues(in) - j.getInputRealValues(in))
                  * (i.getInputRealValues(in) - j.getInputRealValues(in));
        } else {
          if (!i.getInputMissingValues(in)
              && i.getInputNominalValues(in) != j.getInputNominalValues(in)) dist += 1;
        }
        in++;
      } else {
        if (direccion == Attribute.OUTPUT) {
          if (tipo != Attribute.NOMINAL && !i.getOutputMissingValues(out)) {
            dist +=
                (i.getOutputRealValues(out) - j.getOutputRealValues(out))
                    * (i.getOutputRealValues(out) - j.getOutputRealValues(out));
          } else {
            if (!i.getOutputMissingValues(out)
                && i.getOutputNominalValues(out) != j.getOutputNominalValues(out)) dist += 1;
          }
          out++;
        }
      }
    }
    return dist;
  }
Ejemplo n.º 21
0
  /** Process the training and test files provided in the parameters file to the constructor. */
  public void process() {
    // declarations
    double[] outputs;
    double[] outputs2;
    Instance neighbor;
    double dist, mean;
    int actual;
    Randomize rnd = new Randomize();
    Instance ex;
    gCenter kmeans = null;
    int iterations = 0;
    double E;
    double prevE;
    int totalMissing = 0;
    boolean allMissing = true;

    rnd.setSeed(semilla);
    // PROCESS
    try {

      // Load in memory a dataset that contains a classification problem
      IS.readSet(input_train_name, true);
      int in = 0;
      int out = 0;

      ndatos = IS.getNumInstances();
      nvariables = Attributes.getNumAttributes();
      nentradas = Attributes.getInputNumAttributes();
      nsalidas = Attributes.getOutputNumAttributes();

      X = new String[ndatos][nvariables]; // matrix with transformed data
      kmeans = new gCenter(K, ndatos, nvariables);

      timesSeen = new FreqList[nvariables];
      mostCommon = new String[nvariables];

      // first, we choose k 'means' randomly from all
      // instances
      totalMissing = 0;
      for (int i = 0; i < ndatos; i++) {
        Instance inst = IS.getInstance(i);
        if (inst.existsAnyMissingValue()) totalMissing++;
      }
      if (totalMissing == ndatos) allMissing = true;
      else allMissing = false;
      for (int numMeans = 0; numMeans < K; numMeans++) {
        do {
          actual = (int) (ndatos * rnd.Rand());
          ex = IS.getInstance(actual);
        } while (ex.existsAnyMissingValue() && !allMissing);

        kmeans.copyCenter(ex, numMeans);
      }

      // now, iterate adjusting clusters' centers and
      // instances to them
      prevE = 0;
      iterations = 0;
      do {
        for (int i = 0; i < ndatos; i++) {
          Instance inst = IS.getInstance(i);

          kmeans.setClusterOf(inst, i);
        }
        // set new centers
        kmeans.recalculateCenters(IS);
        // compute RMSE
        E = 0;
        for (int i = 0; i < ndatos; i++) {
          Instance inst = IS.getInstance(i);

          E += kmeans.distance(inst, kmeans.getClusterOf(i));
        }
        iterations++;
        // System.out.println(iterations+"\t"+E);
        if (Math.abs(prevE - E) == 0) iterations = maxIter;
        else prevE = E;
      } while (E > minError && iterations < maxIter);
      for (int i = 0; i < ndatos; i++) {
        Instance inst = IS.getInstance(i);

        in = 0;
        out = 0;

        for (int j = 0; j < nvariables; j++) {
          Attribute a = Attributes.getAttribute(j);

          direccion = a.getDirectionAttribute();
          tipo = a.getType();

          if (direccion == Attribute.INPUT) {
            if (tipo != Attribute.NOMINAL && !inst.getInputMissingValues(in)) {
              X[i][j] = new String(String.valueOf(inst.getInputRealValues(in)));
            } else {
              if (!inst.getInputMissingValues(in)) X[i][j] = inst.getInputNominalValues(in);
              else {
                actual = kmeans.getClusterOf(i);
                X[i][j] = new String(kmeans.valueAt(actual, j));
              }
            }
            in++;
          } else {
            if (direccion == Attribute.OUTPUT) {
              if (tipo != Attribute.NOMINAL && !inst.getOutputMissingValues(out)) {
                X[i][j] = new String(String.valueOf(inst.getOutputRealValues(out)));
              } else {
                if (!inst.getOutputMissingValues(out)) X[i][j] = inst.getOutputNominalValues(out);
                else {
                  actual = kmeans.getClusterOf(i);
                  X[i][j] = new String(kmeans.valueAt(actual, j));
                }
              }
              out++;
            }
          }
        }
      }
    } catch (Exception e) {
      System.out.println("Dataset exception = " + e);
      e.printStackTrace();
      System.exit(-1);
    }
    write_results(output_train_name);
    /** ************************************************************************************ */
    // does a test file associated exist?
    if (input_train_name.compareTo(input_test_name) != 0) {
      try {

        // Load in memory a dataset that contains a classification problem
        IStest.readSet(input_test_name, false);
        int in = 0;
        int out = 0;

        ndatos = IStest.getNumInstances();
        nvariables = Attributes.getNumAttributes();
        nentradas = Attributes.getInputNumAttributes();
        nsalidas = Attributes.getOutputNumAttributes();

        for (int i = 0; i < ndatos; i++) {
          Instance inst = IStest.getInstance(i);

          in = 0;
          out = 0;

          for (int j = 0; j < nvariables; j++) {
            Attribute a = Attributes.getAttribute(j);

            direccion = a.getDirectionAttribute();
            tipo = a.getType();

            if (direccion == Attribute.INPUT) {
              if (tipo != Attribute.NOMINAL && !inst.getInputMissingValues(in)) {
                X[i][j] = new String(String.valueOf(inst.getInputRealValues(in)));
              } else {
                if (!inst.getInputMissingValues(in)) X[i][j] = inst.getInputNominalValues(in);
                else {
                  actual = kmeans.getClusterOf(i);
                  X[i][j] = new String(kmeans.valueAt(actual, j));
                }
              }
              in++;
            } else {
              if (direccion == Attribute.OUTPUT) {
                if (tipo != Attribute.NOMINAL && !inst.getOutputMissingValues(out)) {
                  X[i][j] = new String(String.valueOf(inst.getOutputRealValues(out)));
                } else {
                  if (!inst.getOutputMissingValues(out)) X[i][j] = inst.getOutputNominalValues(out);
                  else {
                    actual = kmeans.getClusterOf(i);
                    X[i][j] = new String(kmeans.valueAt(actual, j));
                  }
                }
                out++;
              }
            }
          }
        }
      } catch (Exception e) {
        System.out.println("Dataset exception = " + e);
        e.printStackTrace();
        System.exit(-1);
      }
      write_results(output_test_name);
    }
  }
 public Instance pipe(Instance carrier) {
   carrier.setData(new TokenIterator((TokenSequence) carrier.getData()));
   return carrier;
 }
Ejemplo n.º 23
0
  /** Process the training and test files provided in the parameters file to the constructor. */
  public void process() {
    double[] outputs;
    double[] outputs2;
    Instance neighbor;
    double dist, mean;
    int actual;
    int[] N = new int[nneigh];
    double[] Ndist = new double[nneigh];
    boolean allNull;
    svm_problem SVMp = null;
    svm_parameter SVMparam = new svm_parameter();
    svm_model svr = null;
    svm_node SVMn[];
    double[] outputsCandidate = null;
    boolean same = true;
    Vector instancesSelected = new Vector();
    Vector instancesSelected2 = new Vector();

    // SVM PARAMETERS
    SVMparam.C = C;
    SVMparam.cache_size = 10; // 10MB of cache
    SVMparam.degree = degree;
    SVMparam.eps = eps;
    SVMparam.gamma = gamma;
    SVMparam.nr_weight = 0;
    SVMparam.nu = nu;
    SVMparam.p = p;
    SVMparam.shrinking = shrinking;
    SVMparam.probability = 0;
    if (kernelType.compareTo("LINEAR") == 0) {
      SVMparam.kernel_type = svm_parameter.LINEAR;
    } else if (kernelType.compareTo("POLY") == 0) {
      SVMparam.kernel_type = svm_parameter.POLY;
    } else if (kernelType.compareTo("RBF") == 0) {
      SVMparam.kernel_type = svm_parameter.RBF;
    } else if (kernelType.compareTo("SIGMOID") == 0) {
      SVMparam.kernel_type = svm_parameter.SIGMOID;
    }

    SVMparam.svm_type = svm_parameter.EPSILON_SVR;

    try {

      // Load in memory a dataset that contains a classification problem
      IS.readSet(input_train_name, true);
      int in = 0;
      int out = 0;

      ndatos = IS.getNumInstances();
      nvariables = Attributes.getNumAttributes();
      nentradas = Attributes.getInputNumAttributes();
      nsalidas = Attributes.getOutputNumAttributes();

      X = new String[ndatos][2]; // matrix with transformed data

      mostCommon = new String[nvariables];
      SVMp = new svm_problem();
      SVMp.l = ndatos;
      SVMp.y = new double[SVMp.l];
      SVMp.x = new svm_node[SVMp.l][nentradas + 1];
      for (int l = 0; l < SVMp.l; l++) {
        for (int n = 0; n < Attributes.getInputNumAttributes() + 1; n++) {
          SVMp.x[l][n] = new svm_node();
        }
      }

      for (int i = 0; i < ndatos; i++) {
        Instance inst = IS.getInstance(i);

        SVMp.y[i] = inst.getAllOutputValues()[0];
        for (int n = 0; n < Attributes.getInputNumAttributes(); n++) {
          SVMp.x[i][n].index = n;
          SVMp.x[i][n].value = inst.getAllInputValues()[n];
          SVMp.y[i] = inst.getAllOutputValues()[0];
        }
        // end of instance
        SVMp.x[i][nentradas].index = -1;
      }
      if (svm.svm_check_parameter(SVMp, SVMparam) != null) {
        System.out.println("SVM parameter error in training:");
        System.out.println(svm.svm_check_parameter(SVMp, SVMparam));
        System.exit(-1);
      }
      // train the SVM
      if (ndatos > 0) {
        svr = svm.svm_train(SVMp, SVMparam);
      }
      for (int i = 0; i < ndatos; i++) {
        Instance inst = IS.getInstance(i);
        X[i][0] = new String(String.valueOf(inst.getAllOutputValues()[0]));
        //			the values used for regression
        SVMn = new svm_node[Attributes.getInputNumAttributes() + 1];
        for (int n = 0; n < Attributes.getInputNumAttributes(); n++) {
          SVMn[n] = new svm_node();
          SVMn[n].index = n;
          SVMn[n].value = inst.getAllInputValues()[n];
        }
        SVMn[nentradas] = new svm_node();
        SVMn[nentradas].index = -1;
        // pedict the class
        X[i][1] = new String(String.valueOf((svm.svm_predict(svr, SVMn))));
      }
    } catch (Exception e) {
      System.out.println("Dataset exception = " + e);
      e.printStackTrace();
      System.exit(-1);
    }
    write_results(output_train_name);
    /** ************************************************************************************ */
    try {

      // Load in memory a dataset that contains a classification
      // problem
      IS.readSet(input_test_name, false);
      int in = 0;
      int out = 0;

      ndatos = IS.getNumInstances();
      nvariables = Attributes.getNumAttributes();
      nentradas = Attributes.getInputNumAttributes();
      nsalidas = Attributes.getOutputNumAttributes();

      X = new String[ndatos][2]; // matrix with transformed data
      // data

      mostCommon = new String[nvariables];

      for (int i = 0; i < ndatos; i++) {
        Instance inst = IS.getInstance(i);
        X[i][0] = new String(String.valueOf(inst.getAllOutputValues()[0]));

        SVMn = new svm_node[Attributes.getInputNumAttributes() + 1];
        for (int n = 0; n < Attributes.getInputNumAttributes(); n++) {
          SVMn[n] = new svm_node();
          SVMn[n].index = n;
          SVMn[n].value = inst.getAllInputValues()[n];
        }
        SVMn[nentradas] = new svm_node();
        SVMn[nentradas].index = -1;
        // pedict the class
        X[i][1] = new String(String.valueOf(svm.svm_predict(svr, SVMn)));
      }
    } catch (Exception e) {
      System.out.println("Dataset exception = " + e);
      e.printStackTrace();
      System.exit(-1);
    }
    System.out.println("escribiendo test");
    write_results(output_test_name);
  }
Ejemplo n.º 24
0
  /**
   * If we know in advance that the table is numeric, can optimize a lot... For example, on 9803 x
   * 294 table, TableFileLoader.readNumeric takes 6s compared to 12s for WekaMine readFromTable.
   */
  public static Instances readNumeric(String fileName, String relationName, String delimiter)
      throws Exception {

    int numAttributes = FileUtils.fastCountLines(fileName) - 1; // -1 exclude heading.
    String[] attrNames = new String[numAttributes];

    // Read the col headings and figure out the number of columns in the table..
    BufferedReader reader = new BufferedReader(new FileReader(fileName), 4194304);
    String line = reader.readLine();
    String[] instanceNames = parseColNames(line, delimiter);
    int numInstances = instanceNames.length;

    System.err.print("reading " + numAttributes + " x " + numInstances + " table..");

    // Create an array to hold the data as we read it in...
    double dataArray[][] = new double[numAttributes][numInstances];

    // Populate the matrix with values...
    String valToken = "";
    try {
      int rowIdx = 0;
      while ((line = reader.readLine()) != null) {

        String[] tokens = line.split(delimiter, -1);
        attrNames[rowIdx] = tokens[0].trim();
        for (int colIdx = 0; colIdx < (tokens.length - 1); colIdx++) {
          valToken = tokens[colIdx + 1];
          double value;

          if (valToken.equals("null")) {
            value = Instance.missingValue();
          } else if (valToken.equals("?")) {
            value = Instance.missingValue();
          } else if (valToken.equals("NA")) {
            value = Instance.missingValue();
          } else if (valToken.equals("")) {
            value = Instance.missingValue();
            // }else value = DoubleParser.lightningParse(valToken); // faster double parser with
            // MANY assumptions
          } else value = Double.parseDouble(valToken);
          dataArray[rowIdx][colIdx] = value;
        }
        rowIdx++;
      }
    } catch (NumberFormatException e) {
      System.err.println(e.toString());
      System.err.println("Parsing line: " + line);
      System.err.println("Parsing token: " + valToken);
    }

    // Set up attributes, which for colInstances will be the rowNames...
    FastVector atts = new FastVector();
    for (int a = 0; a < numAttributes; a++) {
      atts.addElement(new Attribute(attrNames[a]));
    }

    // Create Instances object..
    Instances data = new Instances(relationName, atts, 0);
    data.setRelationName(relationName);

    System.err.print("creating instances..");

    // System.err.println("DEBUG: numAttributes "+numAttributes);

    /** ***** CREATE INSTANCES ************* */
    // Fill the instances with data...
    // For each instance...
    for (int c = 0; c < numInstances; c++) {
      double[] vals =
          new double[data.numAttributes()]; // Even nominal values are stored as double pointers.

      for (int r = 0; r < numAttributes; r++) {
        double val = dataArray[r][c];
        vals[r] = val;
      }
      // Add the a newly minted instance with those attribute values...
      data.add(new Instance(1.0, vals));
    }

    // System.err.println("DEBUG: data.numInstances: "+data.numInstances());
    // System.err.println("DEBUG: data.numAttributes: "+data.numAttributes());
    // System.err.println("DEBUG: data.relationNAme"+data.relationName());
    System.err.print("add feature names..");

    /** ***** ADD FEATURE NAMES ************* */
    // takes basically zero time... all time is in previous 2 chunks.
    Instances newData = new Instances(data);
    newData.insertAttributeAt(new Attribute("ID", (FastVector) null), 0);
    int attrIdx = newData.attribute("ID").index(); // Paranoid... should be 0

    for (int c = 0; c < numInstances; c++) {
      newData.instance(c).setValue(attrIdx, instanceNames[c]);
    }
    data = newData;

    // System.err.println("DEBUG: data.numInstances: "+data.numInstances());
    // System.err.println("DEBUG: data.numAttributes: "+data.numAttributes());

    return (data);
  }
Ejemplo n.º 25
0
  /**
   * ************************************************** Convert a table to a set of instances, with
   * <b>rows</b> representing individual </b>instances</b> and <b>columns</b> representing
   * <b>attributes</b>
   */
  public Instances tableRowsToNominalInstances(Table t, String relationName) {

    System.err.print("Converting table rows to instances...");

    // Set up attributes, which for rowInstances will be the colNames...
    FastVector atts = new FastVector();
    ArrayList<Boolean> isNominal = new ArrayList<Boolean>();
    ArrayList<FastVector> allAttVals = new ArrayList<FastVector>(); // Save values for later...			

    System.err.print("creating attributes...");

    for (int c = 0; c < t.numCols; c++) {
      // It's nominal... determine the range of values
      isNominal.add(true);
      FastVector attVals = getColValues(t, c);
      atts.addElement(new Attribute(t.colNames[c], attVals));
      // Save it for later
      allAttVals.add(attVals);
    }

    System.err.print("creating instances...");

    // Create Instances object..
    Instances data = new Instances(relationName, atts, 0);
    data.setRelationName(relationName);

    // Fill the instances with data...
    // For each instance...
    for (int r = 0; r < t.numRows; r++) {
      double[] vals = new double[data.numAttributes()];

      // for each attribute
      for (int c = 0; c < t.numCols; c++) {
        String val = (String) t.matrix.getQuick(r, c);
        if (val == "?") vals[c] = Instance.missingValue();
        else if (isNominal.get(c)) {
          vals[c] = allAttVals.get(c).indexOf(val);
        } else {
          vals[c] = Double.parseDouble((String) val);
        }
      }
      // Add the a newly minted instance with those attribute values...
      data.add(new Instance(1.0, vals));
    }

    System.err.print("add feature names...");

    if (addInstanceNamesAsFeatures) {
      Instances newData = new Instances(data);
      newData.insertAttributeAt(new Attribute("ID", (FastVector) null), 0);
      int attrIdx = newData.attribute("ID").index(); // Paranoid... should be 0

      // We save the instanceNames in a list because it's handy later on...
      instanceNames = new ArrayList<String>();

      for (int r = 0; r < t.rowNames.length; r++) {
        instanceNames.add(t.rowNames[r]);
        newData.instance(r).setValue(attrIdx, t.rowNames[r]);
      }
      data = newData;
    }

    System.err.println("done.");

    return (data);
  }
Ejemplo n.º 26
0
  /** @param args */
  public static void main(final String[] args) throws Exception {
    final BufferedReader br =
        new BufferedReader(
            new InputStreamReader(
                (new GZIPInputStream(
                    new FileInputStream(
                        new File(
                            new File(System.getProperty("user.dir")), "testdata/mobo1.txt.gz"))))));

    final List<Instance> instances = Lists.newLinkedList();

    int count = 0;
    while (true) {
      count++;
      final String line = br.readLine();
      if (line == null) {
        break;
      }
      final JSONObject jo = (JSONObject) JSONValue.parse(line);
      final HashMapAttributes a = new HashMapAttributes();
      a.putAll((JSONObject) jo.get("attributes"));
      Instance instance = new Instance(a, (String) jo.get("output"));
      instances.add(instance);
    }

    final List<Instance> train = instances.subList(0, instances.size() / 2);
    final List<Instance> test = instances.subList(instances.size() / 2 + 1, instances.size() - 1);

    System.out.println("Read " + instances.size() + " instances");

    System.out.println("Testing scorers with single decision node");
    for (final Scorer scorer : Sets.newHashSet(new Scorer1())) {
      final TreeBuilder tb = new TreeBuilder(scorer);

      final long startTime = System.currentTimeMillis();
      final Node tree = tb.buildPredictiveModel(train).node;
      System.out.println(
          scorer.getClass().getSimpleName()
              + " build time "
              + (System.currentTimeMillis() - startTime)
              + ", size: "
              + tree.size()
              + " mean depth: "
              + tree.meanDepth());

      int correctlyClassified = 0;
      for (Instance testInstance : test) {
        String result = (String) tree.getLeaf(testInstance.getAttributes()).getBestClassification();
        if (result.equals(testInstance.getClassification())) {
          correctlyClassified++;
        }
      }
      System.out.println(", accuracy: " + (double) correctlyClassified / test.size());

      System.out.println("Testing random forest");

      for (int i = 2; i <= 20; i++) {
        RandomForestBuilder rfBuilder = new RandomForestBuilder(new TreeBuilder());
        RandomForest randomForest = rfBuilder.buildPredictiveModel(train);
        correctlyClassified = 0;
        for (Instance testInstance : test) {
          Serializable result =
              randomForest.getClassificationByMaxProb(testInstance.getAttributes());
          if (result.equals(testInstance.getClassification())) {
            correctlyClassified++;
          }
        }
        System.out.println(
            "accuracy with " + i + " trees: " + (double) correctlyClassified / test.size());
        // ObjectOutputStream out = new ObjectOutputStream(new FileOutputStream(new
        // File("baggedTree.ser")));
        // out.writeObject(baggedTree);
      }
    }
  }
Ejemplo n.º 27
0
  /**
   * ************************************************** Convert a table to a set of instances, with
   * <b>columns</b> representing individual </b>instances</b> and <b>rows</b> representing
   * <b>attributes</b> (e.g. as is common with microarray data)
   */
  public Instances tableColsToInstances(Table t, String relationName) {

    System.err.print("Converting table cols to instances...");

    // Set up attributes, which for colInstances will be the rowNames...
    FastVector atts = new FastVector();
    ArrayList<Boolean> isNominal = new ArrayList<Boolean>();
    ArrayList<FastVector> allAttVals = new ArrayList<FastVector>(); // Save values for later...

    System.err.print("creating attributes...");

    for (int r = 0; r < t.numRows; r++) {
      if (rowIsNumeric(t, r)) {
        isNominal.add(false);
        atts.addElement(new Attribute(t.rowNames[r]));
        allAttVals.add(null); // No enumeration of attribute values.
      } else {
        // It's nominal... determine the range of values and create a nominal attribute...
        isNominal.add(true);
        FastVector attVals = getRowValues(t, r);
        atts.addElement(new Attribute(t.rowNames[r], attVals));
        // Save it for later
        allAttVals.add(attVals);
      }
    }

    System.err.print("creating instances...");

    // Create Instances object..
    Instances data = new Instances(relationName, atts, 0);
    data.setRelationName(relationName);

    /** ***** CREATE INSTANCES ************* */
    // Fill the instances with data...
    // For each instance...
    for (int c = 0; c < t.numCols; c++) {
      double[] vals =
          new double[data.numAttributes()]; // Even nominal values are stored as double pointers.

      // For each attribute fill in the numeric or attributeValue index...
      for (int r = 0; r < t.numRows; r++) {
        String val = (String) t.matrix.getQuick(r, c);
        if (val == "?") vals[r] = Instance.missingValue();
        else if (isNominal.get(r)) {
          vals[r] = allAttVals.get(r).indexOf(val);
        } else {
          vals[r] = Double.parseDouble((String) val);
        }
      }
      // Add the a newly minted instance with those attribute values...
      data.add(new Instance(1.0, vals));
    }

    System.err.print("add feature names...");

    /** ***** ADD FEATURE NAMES ************* */
    // takes basically zero time... all time is in previous 2 chunks.
    if (addInstanceNamesAsFeatures) {
      Instances newData = new Instances(data);
      newData.insertAttributeAt(new Attribute("ID", (FastVector) null), 0);
      int attrIdx = newData.attribute("ID").index(); // Paranoid... should be 0

      // We save the instanceNames in a list because it's handy later on...
      instanceNames = new ArrayList<String>();

      for (int c = 0; c < t.colNames.length; c++) {
        instanceNames.add(t.colNames[c]);
        newData.instance(c).setValue(attrIdx, t.colNames[c]);
      }
      data = newData;
    }

    System.err.println("done.");

    return (data);
  }
Ejemplo n.º 28
0
  public JSONArray Cluster(String wekaFilePath, int clusterNum) throws Exception {
    File inputFile = new File(wekaFilePath);
    ArffLoader arf = new ArffLoader();
    arf.setFile(inputFile);
    Instances originIns = arf.getDataSet();
    Instances insTest = new Instances(originIns);
    insTest.deleteStringAttributes();
    int totalNum = insTest.numInstances();

    // SimpleKMeans sm = new SimpleKMeans();
    EM em = new EM();
    em.setNumClusters(clusterNum);
    MakeDensityBasedClusterer sm = new MakeDensityBasedClusterer();
    sm.setClusterer(em);
    sm.buildClusterer(insTest);

    System.out.println("totalNum:" + insTest.numInstances());
    System.out.println("============================");
    System.out.println(sm.toString());
    Map<Integer, ArrayList<String>> result = new HashMap<Integer, ArrayList<String>>();
    for (int i = 0; i < clusterNum; i++) {
      result.put(i, new ArrayList<String>());
    }

    for (int i = 0; i < totalNum; i++) {
      Instance ins = originIns.instance(i);
      String word = ins.stringValue(0);
      Instance tempIns = new Instance(ins);
      tempIns.deleteAttributeAt(0);
      int cluster = sm.clusterInstance(tempIns);
      result.get(cluster).add(word);
    }

    // print the result
    ArrayList<String> words = new ArrayList<String>();
    JSONArray keyWords = new JSONArray();
    for (int k : result.keySet()) {
      words = result.get(k);
      PriorityQueue<MyTerm> clusterQueue = new PriorityQueue<MyTerm>(1, MyTermCompare);
      for (int i = 0; i < words.size(); i++) {
        String s = words.get(i);
        assert linkMap.containsKey(s);
        int freq = linkMap.get(s).totalFreq;
        clusterQueue.add(linkMap.get(s));
        words.set(i, "(" + s + ":" + freq + ")");
      }

      JSONArray clusterArray = new JSONArray();
      int num = clusterQueue.size() / 10 + 1; // 5%
      int totalFreq = 0;
      int totalLength = 0;
      for (int i = 0; i < num && !clusterQueue.isEmpty(); ) {
        JSONObject mem = new JSONObject();
        MyTerm myTerm = clusterQueue.poll();
        String word = myTerm.originTrem.text();
        if (word.length() == 1) {
          continue;
        }
        mem.put("text", word);
        mem.put("freq", myTerm.totalFreq);
        clusterArray.put(mem);
        i++;
        totalFreq += myTerm.totalFreq;
        totalLength += word.length();
      }

      double averFreq = totalFreq * 1.0 / num;
      double averLength = totalLength * 1.0 / num;
      int count = 0;
      while (!clusterQueue.isEmpty() && count < num) {
        MyTerm myTerm = clusterQueue.poll();
        String word = myTerm.originTrem.text();
        int freq = myTerm.totalFreq;
        int times = (int) (word.length() / averFreq) + 1;
        if (freq > averFreq / times) {
          JSONObject mem = new JSONObject();
          mem.put("text", word);
          mem.put("freq", freq);
          mem.put("extra", true);
          clusterArray.put(mem);
        }
      }

      keyWords.put(clusterArray);
      System.out.println(
          "cluster" + k + ":" + words.size() + ":\t" + (int) (words.size() * 1.0 / totalNum * 100));
      if (result.get(k).size() < 100) {
        System.out.println(result.get(k));
      }
    }
    // System.out.println("errorNum:"+errorNum);
    return keyWords;
  }
Ejemplo n.º 29
0
  /**
   * Process a dataset file for a classification problem.
   *
   * @param nfejemplos Name of the dataset file
   * @param train The dataset file is for training or for test
   * @throws java.io.IOException if there is any semantical, lexical or sintactical error in the
   *     input file.
   */
  public void processClassifierDataset(String nfejemplos, boolean train) throws IOException {

    try {

      // Load in memory a dataset that contains a classification problem
      IS.readSet(nfejemplos, train);

      nData = IS.getNumInstances();
      nInputs = Attributes.getInputNumAttributes();
      nVariables = nInputs + Attributes.getOutputNumAttributes();

      // Check that there is only one output variable and
      // it is nominal

      if (Attributes.getOutputNumAttributes() > 1) {
        System.out.println("This algorithm can not process MIMO datasets");
        System.out.println("All outputs but the first one will be removed");
      }

      boolean noOutputs = false;
      if (Attributes.getOutputNumAttributes() < 1) {
        System.out.println("This algorithm can not process datasets without outputs");
        System.out.println("Zero-valued output generated");
        noOutputs = true;
      }

      // Initialize and fill our own tables
      X = new double[nData][nInputs];
      missing = new boolean[nData][nInputs];
      C = new int[nData];

      // Maximum and minimum of inputs
      iMaximum = new double[nInputs];
      iMinimum = new double[nInputs];

      // Maximum and minimum for output data
      oMaximum = 0;
      oMinimum = 0;

      // All values are casted into double/integer
      nClasses = 0;
      for (int i = 0; i < X.length; i++) {
        Instance inst = IS.getInstance(i);
        for (int j = 0; j < nInputs; j++) {
          X[i][j] = IS.getInputNumericValue(i, j);
          missing[i][j] = inst.getInputMissingValues(j);
          if (X[i][j] > iMaximum[j] || i == 0) {
            iMaximum[j] = X[i][j];
          }
          if (X[i][j] < iMinimum[j] || i == 0) {
            iMinimum[j] = X[i][j];
          }
        }

        if (noOutputs) {
          C[i] = 0;
        } else {
          C[i] = (int) IS.getOutputNumericValue(i, 0);
        }
        if (C[i] > nClasses) {
          nClasses = C[i];
        }
      }
      nClasses++;
      System.out.println("Number of classes=" + nClasses);

    } catch (Exception e) {
      System.out.println("DBG: Exception in readSet");
      e.printStackTrace();
    }
  }
Ejemplo n.º 30
0
  /** Process the training and test files provided in the parameters file to the constructor. */
  public void process() {
    double[] outputs;
    double[] outputs2;
    try {
      FileWriter file_write = new FileWriter(output_train_name);

      try {

        // Load in memory a dataset that contains a classification problem
        IS.readSet(input_train_name, true);
        int in = 0;
        int out = 0;
        int in2 = 0;
        int out2 = 0;
        int lastMissing = -1;
        boolean fin = false;
        boolean stepNext = false;

        ndatos = IS.getNumInstances();
        nvariables = Attributes.getNumAttributes();
        nentradas = Attributes.getInputNumAttributes();
        nsalidas = Attributes.getOutputNumAttributes();

        String[] row = null;
        X = new Vector[ndatos]; // matrix with transformed data
        for (int i = 0; i < ndatos; i++) X[i] = new Vector();

        timesSeen = new FreqList[nvariables];
        mostCommon = new String[nvariables];

        file_write.write(IS.getHeader());

        // now, print the normalized data
        file_write.write("@data\n");

        // now, search for missed data, and replace them with
        // the most common value

        for (int i = 0; i < ndatos; i++) {
          Instance inst = IS.getInstance(i);
          in = 0;
          out = 0;
          row = new String[nvariables];

          for (int j = 0; j < nvariables; j++) {
            Attribute a = Attributes.getAttribute(j);

            direccion = a.getDirectionAttribute();
            tipo = a.getType();

            if (direccion == Attribute.INPUT) {
              if (tipo != Attribute.NOMINAL && !inst.existsAnyMissingValue()) {
                row[j] = new String(String.valueOf(inst.getInputRealValues(in)));
              } else {
                if (!inst.existsAnyMissingValue()) row[j] = inst.getInputNominalValues(in);
                else {
                  // missing data
                  outputs = inst.getAllOutputValues();
                  in2 = 0;
                  out2 = 0;
                  for (int attr = 0; attr < nvariables; attr++) {
                    Attribute b = Attributes.getAttribute(attr);
                    direccion = b.getDirectionAttribute();
                    tipo = b.getType();
                    if (direccion == Attribute.INPUT) {
                      if (tipo != Attribute.NOMINAL && !inst.getInputMissingValues(in2)) {
                        row[attr] = new String(String.valueOf(inst.getInputRealValues(in2)));
                      } else {
                        if (!inst.getInputMissingValues(in2))
                          row[attr] = inst.getInputNominalValues(in2);
                      }
                      in2++;
                    } else {
                      if (direccion == Attribute.OUTPUT) {
                        if (tipo != Attribute.NOMINAL && !inst.getOutputMissingValues(out2)) {
                          row[attr] = new String(String.valueOf(inst.getOutputRealValues(out2)));
                        } else {
                          if (!inst.getOutputMissingValues(out2))
                            row[attr] = inst.getOutputNominalValues(out2);
                        }
                        out2++;
                      }
                    }
                  }
                  // make frecuencies  for each attribute
                  for (int attr = 0; attr < nvariables; attr++) {
                    Attribute b = Attributes.getAttribute(attr);

                    direccion = b.getDirectionAttribute();
                    tipo = b.getType();
                    if (direccion == Attribute.INPUT && inst.getInputMissingValues(attr)) {
                      lastMissing = attr;
                      timesSeen[attr] = new FreqList();
                      for (int m = 0; m < ndatos; m++) {
                        Instance inst2 = IS.getInstance(m);
                        outputs2 = inst2.getAllOutputValues();
                        boolean sameClass = true;
                        // are they same concept instances??
                        for (int k = 0; k < nsalidas && sameClass; k++)
                          if (outputs[k] != outputs2[k]) sameClass = false;
                        if (sameClass) {
                          if (tipo != Attribute.NOMINAL && !inst2.getInputMissingValues(attr)) {
                            timesSeen[attr].AddElement(
                                new String(String.valueOf(inst2.getInputRealValues(attr))));

                          } else {
                            if (!inst2.getInputMissingValues(attr)) {
                              timesSeen[attr].AddElement(inst2.getInputNominalValues(attr));
                            }
                          }
                        }
                      }
                    }
                  }
                  for (int attr = 0; attr < nvariables; attr++) {
                    if (direccion == Attribute.INPUT && inst.getInputMissingValues(attr)) {
                      timesSeen[attr].reset();
                    }
                  }
                  fin = false;
                  stepNext = false;
                  while (!fin) {
                    in2 = 0;
                    for (int attr = 0; attr < nvariables && !fin; attr++) {
                      Attribute b = Attributes.getAttribute(attr);

                      direccion = b.getDirectionAttribute();
                      tipo = b.getType();
                      if (direccion == Attribute.INPUT && inst.getInputMissingValues(in2)) {
                        if (stepNext) {
                          timesSeen[attr].iterate();
                          stepNext = false;
                        }
                        if (timesSeen[attr].outOfBounds()) {
                          stepNext = true;
                          if (attr == lastMissing) fin = true;
                          timesSeen[attr].reset();
                        }
                        if (!fin)
                          row[attr] =
                              ((ValueFreq) timesSeen[attr].getCurrent())
                                  .getValue(); // replace missing data
                      }
                      in2++;
                    }
                    if (!fin) {
                      stepNext = true;
                      file_write.write(row[0]);
                      for (int y = 1; y < nvariables; y++) {
                        file_write.write("," + row[y]);
                      }
                      file_write.write("\n");
                      // X[i].addElement(row);
                      // row = (String[])row.clone();
                    }
                  }
                }
              }
              in++;
            } else {
              if (direccion == Attribute.OUTPUT) {
                if (tipo != Attribute.NOMINAL && !inst.getOutputMissingValues(out)) {
                  row[j] = new String(String.valueOf(inst.getOutputRealValues(out)));
                } else {
                  if (!inst.getOutputMissingValues(out)) row[j] = inst.getOutputNominalValues(out);
                  else row[j] = new String("?");
                }
                out++;
              }
            }
          }
          if (!inst.existsAnyMissingValue()) {
            file_write.write(row[0]);
            for (int y = 1; y < nvariables; y++) {
              file_write.write("," + row[y]);
            }
            file_write.write("\n");
          }
        }
      } catch (Exception e) {
        System.out.println("Dataset exception = " + e);
        e.printStackTrace();
        System.exit(-1);
      }
      file_write.close();
    } catch (IOException e) {
      System.out.println("IO exception = " + e);
      e.printStackTrace();
      System.exit(-1);
    }

    /** ************************************************************************************ */
    // does a test file associated exist?
    if (input_train_name.compareTo(input_test_name) != 0) {
      try {
        FileWriter file_write = new FileWriter(output_test_name);

        try {

          // Load in memory a dataset that contains a classification problem
          IS.readSet(input_test_name, false);
          int in = 0;
          int out = 0;
          int in2 = 0;
          int out2 = 0;
          int lastMissing = -1;
          boolean fin = false;
          boolean stepNext = false;

          ndatos = IS.getNumInstances();
          nvariables = Attributes.getNumAttributes();
          nentradas = Attributes.getInputNumAttributes();
          nsalidas = Attributes.getOutputNumAttributes();

          String[] row = null;
          X = new Vector[ndatos]; // matrix with transformed data
          for (int i = 0; i < ndatos; i++) X[i] = new Vector();

          timesSeen = new FreqList[nvariables];
          mostCommon = new String[nvariables];

          file_write.write(IS.getHeader());

          // now, print the normalized data
          file_write.write("@data\n");

          // now, search for missed data, and replace them with
          // the most common value

          for (int i = 0; i < ndatos; i++) {
            Instance inst = IS.getInstance(i);
            in = 0;
            out = 0;
            row = new String[nvariables];

            for (int j = 0; j < nvariables; j++) {
              Attribute a = Attributes.getAttribute(j);

              direccion = a.getDirectionAttribute();
              tipo = a.getType();

              if (direccion == Attribute.INPUT) {
                if (tipo != Attribute.NOMINAL && !inst.existsAnyMissingValue()) {
                  row[j] = new String(String.valueOf(inst.getInputRealValues(in)));
                } else {
                  if (!inst.existsAnyMissingValue()) row[j] = inst.getInputNominalValues(in);
                  else {
                    // missing data
                    outputs = inst.getAllOutputValues();
                    in2 = 0;
                    out2 = 0;
                    for (int attr = 0; attr < nvariables; attr++) {
                      Attribute b = Attributes.getAttribute(attr);
                      direccion = b.getDirectionAttribute();
                      tipo = b.getType();
                      if (direccion == Attribute.INPUT) {
                        if (tipo != Attribute.NOMINAL && !inst.getInputMissingValues(in2)) {
                          row[attr] = new String(String.valueOf(inst.getInputRealValues(in2)));
                        } else {
                          if (!inst.getInputMissingValues(in2))
                            row[attr] = inst.getInputNominalValues(in2);
                        }
                        in2++;
                      } else {
                        if (direccion == Attribute.OUTPUT) {
                          if (tipo != Attribute.NOMINAL && !inst.getOutputMissingValues(out2)) {
                            row[attr] = new String(String.valueOf(inst.getOutputRealValues(out2)));
                          } else {
                            if (!inst.getOutputMissingValues(out2))
                              row[attr] = inst.getOutputNominalValues(out2);
                          }
                          out2++;
                        }
                      }
                    }
                    // make frecuencies  for each attribute
                    for (int attr = 0; attr < nvariables; attr++) {
                      Attribute b = Attributes.getAttribute(attr);

                      direccion = b.getDirectionAttribute();
                      tipo = b.getType();
                      if (direccion == Attribute.INPUT && inst.getInputMissingValues(attr)) {
                        lastMissing = attr;
                        timesSeen[attr] = new FreqList();
                        for (int m = 0; m < ndatos; m++) {
                          Instance inst2 = IS.getInstance(m);
                          outputs2 = inst2.getAllOutputValues();
                          boolean sameClass = true;
                          // are they same concept instances??
                          for (int k = 0; k < nsalidas && sameClass; k++)
                            if (outputs[k] != outputs2[k]) sameClass = false;
                          if (sameClass) {
                            if (tipo != Attribute.NOMINAL && !inst2.getInputMissingValues(attr)) {
                              timesSeen[attr].AddElement(
                                  new String(String.valueOf(inst2.getInputRealValues(attr))));

                            } else {
                              if (!inst2.getInputMissingValues(attr)) {
                                timesSeen[attr].AddElement(inst2.getInputNominalValues(attr));
                              }
                            }
                          }
                        }
                      }
                    }
                    for (int attr = 0; attr < nvariables; attr++) {
                      if (direccion == Attribute.INPUT && inst.getInputMissingValues(attr)) {
                        timesSeen[attr].reset();
                      }
                    }
                    fin = false;
                    stepNext = false;
                    while (!fin) {
                      in2 = 0;
                      for (int attr = 0; attr < nvariables && !fin; attr++) {
                        Attribute b = Attributes.getAttribute(attr);

                        direccion = b.getDirectionAttribute();
                        tipo = b.getType();
                        if (direccion == Attribute.INPUT && inst.getInputMissingValues(in2)) {
                          if (stepNext) {
                            timesSeen[attr].iterate();
                            stepNext = false;
                          }
                          if (timesSeen[attr].outOfBounds()) {
                            stepNext = true;
                            if (attr == lastMissing) fin = true;
                            timesSeen[attr].reset();
                          }
                          if (!fin)
                            row[attr] =
                                ((ValueFreq) timesSeen[attr].getCurrent())
                                    .getValue(); // replace missing data
                        }
                        in2++;
                      }
                      if (!fin) {
                        stepNext = true;
                        file_write.write(row[0]);
                        for (int y = 1; y < nvariables; y++) {
                          file_write.write("," + row[y]);
                        }
                        file_write.write("\n");
                        // X[i].addElement(row);
                        // row = (String[])row.clone();
                      }
                    }
                  }
                }
                in++;
              } else {
                if (direccion == Attribute.OUTPUT) {
                  if (tipo != Attribute.NOMINAL && !inst.getOutputMissingValues(out)) {
                    row[j] = new String(String.valueOf(inst.getOutputRealValues(out)));
                  } else {
                    if (!inst.getOutputMissingValues(out))
                      row[j] = inst.getOutputNominalValues(out);
                    else row[j] = new String("?");
                  }
                  out++;
                }
              }
            }
            if (!inst.existsAnyMissingValue()) {
              file_write.write(row[0]);
              for (int y = 1; y < nvariables; y++) {
                file_write.write("," + row[y]);
              }
              file_write.write("\n");
            }
          }
        } catch (Exception e) {
          System.out.println("Dataset exception = " + e);
          e.printStackTrace();
          System.exit(-1);
        }
        file_write.close();
      } catch (IOException e) {
        System.out.println("IO exception = " + e);
        e.printStackTrace();
        System.exit(-1);
      }
    }
  }