Exemple #1
0
  /**
   * Generate artificial training examples.
   *
   * @param artSize size of examples set to create
   * @param data training data
   * @return the set of unlabeled artificial examples
   */
  protected Instances generateArtificialData(int artSize, Instances data) {
    int numAttributes = data.numAttributes();
    Instances artData = new Instances(data, artSize);
    double[] att;
    Instance artInstance;

    for (int i = 0; i < artSize; i++) {
      att = new double[numAttributes];
      for (int j = 0; j < numAttributes; j++) {
        if (data.attribute(j).isNominal()) {
          // Select nominal value based on the frequency of occurence in the training data
          double[] stats = (double[]) m_AttributeStats.get(j);
          att[j] = (double) selectIndexProbabilistically(stats);
        } else if (data.attribute(j).isNumeric()) {
          // Generate numeric value from the Guassian distribution
          // defined by the mean and std dev of the attribute
          double[] stats = (double[]) m_AttributeStats.get(j);
          att[j] = (m_Random.nextGaussian() * stats[1]) + stats[0];
        } else System.err.println("Decorate can only handle numeric and nominal values.");
      }
      artInstance = new Instance(1.0, att);
      artData.add(artInstance);
    }
    return artData;
  }
Exemple #2
0
  // 构造一个tri-trainer分类器。
  public Tritrainer(
      String classifier, String trainingIns_File, String testIns_File, double precentage) {
    try {
      this.classifier1 = (Classifier) Class.forName(classifier).newInstance();
      this.classifier2 = (Classifier) Class.forName(classifier).newInstance();
      this.classifier3 = (Classifier) Class.forName(classifier).newInstance();

      Instances trainingInstances = Util.getInstances(trainingIns_File);

      // 将trainIns_File按照precentage和(1-precentage)的比例切割成labeledIns和unlabeledIns;
      int length = trainingInstances.numInstances();
      int i = new Double(length * precentage).intValue();
      labeledIns = new Instances(trainingInstances, 0);
      for (int j = 0; j < i; j++) {
        labeledIns.add(trainingInstances.firstInstance());
        trainingInstances.delete(0);
      }
      unlabeledIns = trainingInstances;
      testIns = Util.getInstances(testIns_File);

      Init();
    } catch (Exception e) {

    }
  }
Exemple #3
0
  /**
   * Generates a clusterer. Has to initialize all fields of the clusterer that are not being set via
   * options.
   *
   * @param data set of instances serving as training data
   * @exception Exception if the clusterer has not been generated successfully
   */
  public void buildClusterer(Instances data) throws Exception {

    // long start = System.currentTimeMillis();
    if (data.checkForStringAttributes()) {
      throw new Exception("Can't handle string attributes!");
    }

    m_ReplaceMissingFilter = new ReplaceMissingValues();
    m_ReplaceMissingFilter.setInputFormat(data);
    m_instances = Filter.useFilter(data, m_ReplaceMissingFilter);

    initMinMax(m_instances);

    m_ClusterCentroids = new Instances(m_instances, m_NumClusters);

    int n = m_instances.numInstances();
    Random r = new Random(m_Seed);
    boolean[] selected = new boolean[n];
    double[] minDistance = new double[n];

    for (int i = 0; i < n; i++) minDistance[i] = Double.MAX_VALUE;

    int firstI = r.nextInt(n);
    m_ClusterCentroids.add(m_instances.instance(firstI));
    selected[firstI] = true;

    updateMinDistance(minDistance, selected, m_instances, m_instances.instance(firstI));

    if (m_NumClusters > n) m_NumClusters = n;

    for (int i = 1; i < m_NumClusters; i++) {
      int nextI = farthestAway(minDistance, selected);
      m_ClusterCentroids.add(m_instances.instance(nextI));
      selected[nextI] = true;
      updateMinDistance(minDistance, selected, m_instances, m_instances.instance(nextI));
    }

    m_instances = new Instances(m_instances, 0);
    // long end = System.currentTimeMillis();
    // System.out.println("Clustering Time = " + (end-start));
  }
Exemple #4
0
 // 通过h1,h2分类器学习样本集,将h1,h2分类决策相同的样本放入L中,得到标记集合;
 public void updateL(Classifier h1, Classifier h2, Instances L, Instances test) {
   int length = unlabeledIns.numInstances();
   double value1 = 0.0, value2 = 0.0;
   try {
     for (int i = 0; i < length; i++) {
       value1 = h1.classifyInstance(test.instance(i));
       value2 = h2.classifyInstance(test.instance(i));
       if (value1 == value2) {
         // 当两个分类器做出相同决策时重新标记样本的类别;
         test.instance(i).setClassValue(value1);
         L.add(test.instance(i));
       }
     }
   } catch (Exception e) {
     System.out.println(e);
   }
   // return false;
 }
  public void buildClusterer(ArrayList<String> seqDB, double[][] sm) {
    seqList = seqDB;

    this.setSimMatrix(sm);

    Attribute seqString = new Attribute("sequence", (FastVector) null);
    FastVector attrInfo = new FastVector();
    attrInfo.addElement(seqString);
    Instances data = new Instances("data", attrInfo, 0);

    for (int i = 0; i < seqList.size(); i++) {
      Instance currentInst = new Instance(1);
      currentInst.setDataset(data);
      currentInst.setValue(0, seqList.get(i));
      data.add(currentInst);
    }

    try {
      buildClusterer(data);
    } catch (Exception e) {
      // TODO Auto-generated catch block
      e.printStackTrace();
    }
  }
Exemple #6
0
 /**
  * Add new instances to the given set of instances.
  *
  * @param data given instances
  * @param newData set of instances to add to given instances
  */
 protected void addInstances(Instances data, Instances newData) {
   for (int i = 0; i < newData.numInstances(); i++) data.add(newData.instance(i));
 }
  /**
   * ************************************************** Convert a table to a set of instances, with
   * <b>columns</b> representing individual </b>instances</b> and <b>rows</b> representing
   * <b>attributes</b> (e.g. as is common with microarray data)
   */
  public Instances tableColsToInstances(Table t, String relationName) {

    System.err.print("Converting table cols to instances...");

    // Set up attributes, which for colInstances will be the rowNames...
    FastVector atts = new FastVector();
    ArrayList<Boolean> isNominal = new ArrayList<Boolean>();
    ArrayList<FastVector> allAttVals = new ArrayList<FastVector>(); // Save values for later...

    System.err.print("creating attributes...");

    for (int r = 0; r < t.numRows; r++) {
      if (rowIsNumeric(t, r)) {
        isNominal.add(false);
        atts.addElement(new Attribute(t.rowNames[r]));
        allAttVals.add(null); // No enumeration of attribute values.
      } else {
        // It's nominal... determine the range of values and create a nominal attribute...
        isNominal.add(true);
        FastVector attVals = getRowValues(t, r);
        atts.addElement(new Attribute(t.rowNames[r], attVals));
        // Save it for later
        allAttVals.add(attVals);
      }
    }

    System.err.print("creating instances...");

    // Create Instances object..
    Instances data = new Instances(relationName, atts, 0);
    data.setRelationName(relationName);

    /** ***** CREATE INSTANCES ************* */
    // Fill the instances with data...
    // For each instance...
    for (int c = 0; c < t.numCols; c++) {
      double[] vals =
          new double[data.numAttributes()]; // Even nominal values are stored as double pointers.

      // For each attribute fill in the numeric or attributeValue index...
      for (int r = 0; r < t.numRows; r++) {
        String val = (String) t.matrix.getQuick(r, c);
        if (val == "?") vals[r] = Instance.missingValue();
        else if (isNominal.get(r)) {
          vals[r] = allAttVals.get(r).indexOf(val);
        } else {
          vals[r] = Double.parseDouble((String) val);
        }
      }
      // Add the a newly minted instance with those attribute values...
      data.add(new Instance(1.0, vals));
    }

    System.err.print("add feature names...");

    /** ***** ADD FEATURE NAMES ************* */
    // takes basically zero time... all time is in previous 2 chunks.
    if (addInstanceNamesAsFeatures) {
      Instances newData = new Instances(data);
      newData.insertAttributeAt(new Attribute("ID", (FastVector) null), 0);
      int attrIdx = newData.attribute("ID").index(); // Paranoid... should be 0

      // We save the instanceNames in a list because it's handy later on...
      instanceNames = new ArrayList<String>();

      for (int c = 0; c < t.colNames.length; c++) {
        instanceNames.add(t.colNames[c]);
        newData.instance(c).setValue(attrIdx, t.colNames[c]);
      }
      data = newData;
    }

    System.err.println("done.");

    return (data);
  }
  /**
   * ************************************************** Convert a table to a set of instances, with
   * <b>rows</b> representing individual </b>instances</b> and <b>columns</b> representing
   * <b>attributes</b>
   */
  public Instances tableRowsToNominalInstances(Table t, String relationName) {

    System.err.print("Converting table rows to instances...");

    // Set up attributes, which for rowInstances will be the colNames...
    FastVector atts = new FastVector();
    ArrayList<Boolean> isNominal = new ArrayList<Boolean>();
    ArrayList<FastVector> allAttVals = new ArrayList<FastVector>(); // Save values for later...			

    System.err.print("creating attributes...");

    for (int c = 0; c < t.numCols; c++) {
      // It's nominal... determine the range of values
      isNominal.add(true);
      FastVector attVals = getColValues(t, c);
      atts.addElement(new Attribute(t.colNames[c], attVals));
      // Save it for later
      allAttVals.add(attVals);
    }

    System.err.print("creating instances...");

    // Create Instances object..
    Instances data = new Instances(relationName, atts, 0);
    data.setRelationName(relationName);

    // Fill the instances with data...
    // For each instance...
    for (int r = 0; r < t.numRows; r++) {
      double[] vals = new double[data.numAttributes()];

      // for each attribute
      for (int c = 0; c < t.numCols; c++) {
        String val = (String) t.matrix.getQuick(r, c);
        if (val == "?") vals[c] = Instance.missingValue();
        else if (isNominal.get(c)) {
          vals[c] = allAttVals.get(c).indexOf(val);
        } else {
          vals[c] = Double.parseDouble((String) val);
        }
      }
      // Add the a newly minted instance with those attribute values...
      data.add(new Instance(1.0, vals));
    }

    System.err.print("add feature names...");

    if (addInstanceNamesAsFeatures) {
      Instances newData = new Instances(data);
      newData.insertAttributeAt(new Attribute("ID", (FastVector) null), 0);
      int attrIdx = newData.attribute("ID").index(); // Paranoid... should be 0

      // We save the instanceNames in a list because it's handy later on...
      instanceNames = new ArrayList<String>();

      for (int r = 0; r < t.rowNames.length; r++) {
        instanceNames.add(t.rowNames[r]);
        newData.instance(r).setValue(attrIdx, t.rowNames[r]);
      }
      data = newData;
    }

    System.err.println("done.");

    return (data);
  }
  /**
   * If we know in advance that the table is numeric, can optimize a lot... For example, on 9803 x
   * 294 table, TableFileLoader.readNumeric takes 6s compared to 12s for WekaMine readFromTable.
   */
  public static Instances readNumeric(String fileName, String relationName, String delimiter)
      throws Exception {

    int numAttributes = FileUtils.fastCountLines(fileName) - 1; // -1 exclude heading.
    String[] attrNames = new String[numAttributes];

    // Read the col headings and figure out the number of columns in the table..
    BufferedReader reader = new BufferedReader(new FileReader(fileName), 4194304);
    String line = reader.readLine();
    String[] instanceNames = parseColNames(line, delimiter);
    int numInstances = instanceNames.length;

    System.err.print("reading " + numAttributes + " x " + numInstances + " table..");

    // Create an array to hold the data as we read it in...
    double dataArray[][] = new double[numAttributes][numInstances];

    // Populate the matrix with values...
    String valToken = "";
    try {
      int rowIdx = 0;
      while ((line = reader.readLine()) != null) {

        String[] tokens = line.split(delimiter, -1);
        attrNames[rowIdx] = tokens[0].trim();
        for (int colIdx = 0; colIdx < (tokens.length - 1); colIdx++) {
          valToken = tokens[colIdx + 1];
          double value;

          if (valToken.equals("null")) {
            value = Instance.missingValue();
          } else if (valToken.equals("?")) {
            value = Instance.missingValue();
          } else if (valToken.equals("NA")) {
            value = Instance.missingValue();
          } else if (valToken.equals("")) {
            value = Instance.missingValue();
            // }else value = DoubleParser.lightningParse(valToken); // faster double parser with
            // MANY assumptions
          } else value = Double.parseDouble(valToken);
          dataArray[rowIdx][colIdx] = value;
        }
        rowIdx++;
      }
    } catch (NumberFormatException e) {
      System.err.println(e.toString());
      System.err.println("Parsing line: " + line);
      System.err.println("Parsing token: " + valToken);
    }

    // Set up attributes, which for colInstances will be the rowNames...
    FastVector atts = new FastVector();
    for (int a = 0; a < numAttributes; a++) {
      atts.addElement(new Attribute(attrNames[a]));
    }

    // Create Instances object..
    Instances data = new Instances(relationName, atts, 0);
    data.setRelationName(relationName);

    System.err.print("creating instances..");

    // System.err.println("DEBUG: numAttributes "+numAttributes);

    /** ***** CREATE INSTANCES ************* */
    // Fill the instances with data...
    // For each instance...
    for (int c = 0; c < numInstances; c++) {
      double[] vals =
          new double[data.numAttributes()]; // Even nominal values are stored as double pointers.

      for (int r = 0; r < numAttributes; r++) {
        double val = dataArray[r][c];
        vals[r] = val;
      }
      // Add the a newly minted instance with those attribute values...
      data.add(new Instance(1.0, vals));
    }

    // System.err.println("DEBUG: data.numInstances: "+data.numInstances());
    // System.err.println("DEBUG: data.numAttributes: "+data.numAttributes());
    // System.err.println("DEBUG: data.relationNAme"+data.relationName());
    System.err.print("add feature names..");

    /** ***** ADD FEATURE NAMES ************* */
    // takes basically zero time... all time is in previous 2 chunks.
    Instances newData = new Instances(data);
    newData.insertAttributeAt(new Attribute("ID", (FastVector) null), 0);
    int attrIdx = newData.attribute("ID").index(); // Paranoid... should be 0

    for (int c = 0; c < numInstances; c++) {
      newData.instance(c).setValue(attrIdx, instanceNames[c]);
    }
    data = newData;

    // System.err.println("DEBUG: data.numInstances: "+data.numInstances());
    // System.err.println("DEBUG: data.numAttributes: "+data.numAttributes());

    return (data);
  }