/**
   * Computes the median of every numeric attribute and stores it in {@code medians}.
   *
   * <p>Missing values are ignored. A numeric attribute without any present value, and every
   * non-numeric attribute, keeps the array default of 0.0. The {@code imputations} counter array is
   * (re)allocated here as well. Once all medians are known, one log line is written per numeric
   * attribute.
   *
   * @param instances the dataset to scan
   */
protected void searchMedian(Instances instances) {
    final int attributeCount = instances.numAttributes();
    final int rowCount = instances.numInstances();

    medians = new double[attributeCount];
    imputations = new int[attributeCount];

    // First pass: collect the present values of each numeric attribute and take their median.
    for (int att = 0; att < attributeCount; att++) {
      if (!instances.attribute(att).isNumeric()) {
        continue;
      }

      double[] present = new double[rowCount];
      int presentCount = 0;
      for (int row = 0; row < rowCount; row++) {
        double value = instances.get(row).value(att);
        if (!Utils.isMissingValue(value)) {
          present[presentCount++] = value;
        }
      }

      if (presentCount > 0) {
        // Only the filled prefix of the buffer takes part in the median.
        medians[att] = new Median().evaluate(Arrays.copyOf(present, presentCount));
      }
    }

    // Second pass: report the result for each numeric attribute.
    for (int att = 0; att < attributeCount; att++) {
      if (instances.attribute(att).isNumeric()) {
        String message = "Attribute " + instances.attribute(att) + " - Median: " + medians[att];
        Conversion.log("OK", "Impute Numeric", message);
      }
    }
  }
  /**
   * Trains a NaiveBayes model on one ARFF file, evaluates it on a second one and prints the model,
   * the evaluation summary and the confusion matrix to standard output. The last attribute of each
   * dataset is used as the class.
   *
   * @param args args[0]: train arff path, args[1]: test arff path
   * @throws Exception if loading, training or evaluation fails
   */
  public static void run(String[] args) throws Exception {
    // Training
    Instances trainSet = new DataSource(args[0]).getDataSet();
    trainSet.setClassIndex(trainSet.numAttributes() - 1);
    NaiveBayes classifier = new NaiveBayes();
    classifier.buildClassifier(trainSet);

    // Evaluation on the held-out file
    Evaluation evaluation = new Evaluation(trainSet);
    Instances testSet = new DataSource(args[1]).getDataSet();
    testSet.setClassIndex(testSet.numAttributes() - 1);
    evaluation.evaluateModel(classifier, testSet);

    // Report
    System.out.println(classifier.toString());
    System.out.println(evaluation.toSummaryString("\nResults\n======\n", false));
    System.out.println("======\nConfusion Matrix:");
    for (double[] row : evaluation.confusionMatrix()) {
      for (double cell : row) {
        System.out.format("%10s ", cell);
      }
      System.out.print("\n");
    }
  }
  /**
   * Builds the output dataset by copying every input value and adding, per instance, the frequency
   * of each POS tag found in its "content" attribute.
   *
   * @param instances the input instances; must contain a "content" attribute
   * @return a new dataset in the format given by {@code determineOutputFormat}
   * @throws Exception if the output format or the tagging fails
   */
  @Override
  protected Instances process(Instances instances) throws Exception {
    Instances output = new Instances(determineOutputFormat(instances), 0);

    Tagger tagger = new Tagger();
    tagger.loadModel("models/model.20120919");

    // the attribute holding the text of the tweet
    Attribute contentAttribute = instances.attribute("content");

    for (int row = 0; row < instances.numInstances(); row++) {
      double[] rowValues = new double[output.numAttributes()];

      // carry over the original attribute values unchanged
      for (int col = 0; col < instances.numAttributes(); col++) {
        rowValues[col] = instances.instance(row).value(col);
      }

      String text = instances.instance(row).stringValue(contentAttribute);
      List<String> tokens = MyUtils.cleanTokenize(text);
      List<String> tags = MyUtils.getPOStags(tokens, tagger);

      // write each tag's frequency into its "POS-<tag>" column
      Map<String, Integer> tagCounts = MyUtils.calculateTermFreq(tags);
      for (Map.Entry<String, Integer> tagCount : tagCounts.entrySet()) {
        int column = output.attribute("POS-" + tagCount.getKey()).index();
        rowValues[column] = tagCount.getValue();
      }

      output.add(new SparseInstance(1, rowValues));
    }
    return output;
  }
  /**
   * Builds this classifier from a training set and a test set.
   *
   * <p>If either dataset has no class index, both are given the index of the last attribute of the
   * training set. The random number generator is created from the seed at the start and released
   * once the build has completed.
   *
   * @param training the training instances
   * @param test the test instances
   * @throws Exception if something goes wrong
   */
  public void buildClassifier(Instances training, Instances test) throws Exception {
    m_ClassifierBuilt = true;
    m_Random = new Random(m_Seed);
    m_Trainset = training;
    m_Testset = test;

    // default the class attribute to the last training attribute when it is missing on either set
    boolean classUnset = m_Trainset.classIndex() == -1 || m_Testset.classIndex() == -1;
    if (classUnset) {
      int lastTrainAttribute = m_Trainset.numAttributes() - 1;
      m_Trainset.setClassIndex(lastTrainAttribute);
      m_Testset.setClassIndex(lastTrainAttribute);
    }

    checkData(); // are the datasets correct?
    checkRestrictions(); // any other data restrictions not met?
    generateSets(); // generate sets
    build(); // performs the restarts/iterations

    m_Random = null;
  }
예제 #5
0
  /**
   * Signals that the current batch of input is complete. On the first batch (no attribute
   * statistics yet) the statistics of every numeric non-class attribute are gathered and all
   * buffered input instances are converted; afterwards the input buffer is flushed.
   *
   * @return true if there are instances pending output
   * @throws IllegalStateException if no input structure has been defined
   * @throws Exception if converting a buffered instance fails
   */
  public boolean batchFinished() throws Exception {
    if (getInputFormat() == null) {
      throw new IllegalStateException("No input instance format defined");
    }

    if (m_attStats == null) {
      Instances buffered = getInputFormat();
      int attributeCount = buffered.numAttributes();

      m_attStats = new AttributeStats[attributeCount];
      for (int att = 0; att < attributeCount; att++) {
        boolean isClass = buffered.classIndex() == att;
        if (!isClass && buffered.attribute(att).isNumeric()) {
          m_attStats[att] = buffered.attributeStats(att);
        }
      }

      // Convert pending input instances
      for (int row = 0; row < buffered.numInstances(); row++) {
        convertInstance(buffered.instance(row));
      }
    }

    // Free memory
    flushInput();

    m_NewBatch = true;
    return numPendingOutput() != 0;
  }
  /**
   * Converts a multi-label dataset into a single-label one: all label attributes are removed, a
   * nominal attribute "Class" with values "Class1".."ClassN" (N = number of labels) is appended as
   * the class attribute, and every source instance is replaced by the instances returned from
   * {@code transformInstance}.
   *
   * @param mlData the multi-label dataset to transform
   * @return the transformed single-label dataset
   * @throws Exception if removing the labels or transforming an instance fails
   */
  public Instances transformInstances(MultiLabelInstances mlData) throws Exception {
    labelIndices = mlData.getLabelIndices();
    numOfLabels = mlData.getNumLabels();
    Instances source = mlData.getDataSet();

    // start from an empty copy of the header and delete all labels
    Instances output =
        RemoveAllLabels.transformInstances(new Instances(mlData.getDataSet(), 0), labelIndices);

    // add single label attribute
    ArrayList<String> classValues = new ArrayList<String>(numOfLabels);
    for (int label = 1; label <= numOfLabels; label++) {
      classValues.add("Class" + label);
    }
    output.insertAttributeAt(new Attribute("Class", classValues), output.numAttributes());
    output.setClassIndex(output.numAttributes() - 1);

    for (int row = 0; row < source.numInstances(); row++) {
      for (Instance converted : transformInstance(source.instance(row))) {
        output.add(converted);
      }
    }
    return output;
  }
  /**
   * Returns a copy of the data in which a random subset of the attributes has been removed. One
   * random boolean is drawn per attribute; every attribute except the last one whose flag is false
   * is removed. The last attribute is never removed.
   *
   * @param data the dataset to reduce
   * @param rnd the seed of the random number generator
   * @return the filtered dataset
   * @throws Exception if the Remove filter fails
   */
  public static Instances getNewRandomData(Instances data, int rnd) throws Exception {
    Random generator = new Random(rnd);

    boolean[] keep = new boolean[data.numAttributes()];
    for (int att = 0; att < keep.length; att++) {
      keep[att] = generator.nextBoolean();
    }

    // the last attribute is excluded from the selection
    int candidates = keep.length - 1;
    int keptCount = 0;
    for (int att = 0; att < candidates; att++) {
      if (keep[att]) {
        keptCount++;
      }
    }

    int[] dropped = new int[candidates - keptCount];
    int next = 0;
    for (int att = 0; att < candidates; att++) {
      if (!keep[att]) {
        dropped[next] = att;
        next++;
      }
    }

    Remove removeFilter = new Remove();
    removeFilter.setAttributeIndicesArray(dropped);
    removeFilter.setInvertSelection(false);
    removeFilter.setInputFormat(data);
    return Filter.useFilter(data, removeFilter);
  }
예제 #8
0
  /**
   * Runs the RSandFCBF reduction (fuzzySU style, alpha 2.0) on an ARFF dataset and prints the
   * selected attributes. The last attribute of the dataset is used as the class.
   *
   * @param args optional; when {@code args[0]} is present it is used as the path of the ARFF file,
   *     otherwise the built-in default path is used
   * @throws Exception if the dataset cannot be read or the reduction fails
   */
  public static void main(String[] args) throws Exception {
    oneAlgorithm oneAlg = new oneAlgorithm();
    oneAlg.category = xCategory.RSandFCBFalg;
    oneAlg.style = xStyle.fuzzySU;
    oneAlg.flag = false;
    oneAlg.alpha = 2.0;

    // The default only exists on the original author's machine; allow overriding it.
    String fn =
        (args != null && args.length > 0)
            ? args[0]
            : "C:/Users/Eric/Desktop/2011秋冬/Code/Xreducer/data/Data/glass.arff";

    oneFile onef = new oneFile(new File(fn));

    // Instances(Reader) consumes the reader but does not close it, so close it here.
    Instances dataset;
    FileReader reader = new FileReader(fn);
    try {
      dataset = new Instances(reader);
    } finally {
      reader.close();
    }

    dataset.setClassIndex(dataset.numAttributes() - 1);
    onef.ins = dataset.numInstances();
    onef.att = dataset.numAttributes();
    onef.cla = dataset.numClasses();

    RSandFCBFReduceMethod rs = new RSandFCBFReduceMethod(onef, oneAlg);

    boolean[] B = new boolean[rs.NumAttr];
    boolean[] rq = rs.getOneReduction(B);
    System.out.println(Arrays.toString(Utils.boolean2select(rq)));
  }
예제 #9
0
  /**
   * Trains the configured classifier on one ARFF file and evaluates it on another, printing the
   * per-class details, the summary, the confusion matrix and {@code 1 - errorRate} to standard
   * output. Both file names are resolved against the {@code fileName} prefix, and the last
   * attribute of each dataset is used as the class. Any exception is caught and its stack trace is
   * printed.
   *
   * @param trainFileName name of the training corpus file
   * @param testFileName name of the test corpus file
   */
  public static void classify(String trainFileName, String testFileName) {
    try {
      ArffLoader loader = new ArffLoader();

      // read the training corpus
      loader.setFile(new File(fileName + trainFileName));
      Instances trainSet = loader.getDataSet();

      // read the test corpus with the same loader
      loader.setFile(new File(fileName + testFileName));
      Instances testSet = loader.getDataSet();

      // the class label is the last attribute
      testSet.setClassIndex(testSet.numAttributes() - 1);
      trainSet.setClassIndex(trainSet.numAttributes() - 1);

      classifier = (Classifier) Class.forName(CLASSIFIERNAME).newInstance();
      classifier.buildClassifier(trainSet);

      // first argument: the trained classifier; second argument: the dataset to evaluate it on
      Evaluation evaluation = new Evaluation(trainSet);
      evaluation.evaluateModel(classifier, testSet);

      System.out.println(evaluation.toClassDetailsString());
      System.out.println(evaluation.toSummaryString());
      System.out.println(evaluation.toMatrixString());
      System.out.println("precision is :" + (1 - evaluation.errorRate()));
    } catch (Exception e) {
      e.printStackTrace();
    }
  }
예제 #10
0
  /**
   * Builds a regression model for the given data.
   *
   * <p>Unless checks are turned off, the data is validated against the capabilities, copied,
   * stripped of instances with a missing class, and passed through the nominal-to-binary and
   * replace-missing-values filters. Attributes with zero standard deviation are deselected before
   * the model search.
   *
   * @param data the training data to be used for generating the linear regression function
   * @throws Exception if the classifier could not be built successfully
   */
  public void buildClassifier(Instances data) throws Exception {

    if (m_checksTurnedOff) {
      m_TransformFilter = null;
      m_MissingFilter = null;
    } else {
      // can classifier handle the data?
      getCapabilities().testWithFail(data);

      // work on a copy without instances that lack a class value
      data = new Instances(data);
      data.deleteWithMissingClass();

      // Preprocess instances
      m_TransformFilter = new NominalToBinary();
      m_TransformFilter.setInputFormat(data);
      data = Filter.useFilter(data, m_TransformFilter);
      m_MissingFilter = new ReplaceMissingValues();
      m_MissingFilter.setInputFormat(data);
      data = Filter.useFilter(data, m_MissingFilter);
      data.deleteWithMissingClass();
    }

    m_ClassIndex = data.classIndex();
    m_TransformedData = data;

    final int attributeCount = data.numAttributes();

    // Turn all attributes on for a start (everything except the class)
    m_SelectedAttributes = new boolean[attributeCount];
    for (int att = 0; att < attributeCount; att++) {
      m_SelectedAttributes[att] = (att != m_ClassIndex);
    }
    m_Coefficients = null;

    // Compute means and standard deviations; constant attributes are switched off
    m_Means = new double[attributeCount];
    m_StdDevs = new double[attributeCount];
    for (int att = 0; att < attributeCount; att++) {
      if (att == m_ClassIndex) {
        continue;
      }
      m_Means[att] = data.meanOrMode(att);
      m_StdDevs[att] = Math.sqrt(data.variance(att));
      if (m_StdDevs[att] == 0) {
        m_SelectedAttributes[att] = false;
      }
    }

    m_ClassStdDev = Math.sqrt(data.variance(m_ClassIndex));
    m_ClassMean = data.meanOrMode(m_ClassIndex);

    // Perform the regression
    findBestModel();

    // Save memory: keep only the header of the transformed data
    m_TransformedData = new Instances(data, 0);
  }
예제 #11
0
  /**
   * Convert an <code>Instance</code> to an array of values that matches the format of the mining
   * schema. Raw attribute values are mapped first (nominal and string values go through the
   * per-field value maps), then the schema's missing value and outlier treatments are applied, and
   * finally the derived fields are computed and appended.
   *
   * @param inst the <code>Instance</code> to convert
   * @param miningSchema the mining schema incoming instance attributes
   * @return an array of doubles that are values from the incoming Instances, correspond to the
   *     format of the mining schema and have had missing values, outliers etc. dealt with.
   * @throws Exception if something goes wrong
   */
  public double[] instanceToSchema(Instance inst, MiningSchema miningSchema) throws Exception {
    Instances schemaHeader = miningSchema.getMiningSchemaAsInstances();

    // allocate enough space for both mining schema fields and any derived fields
    double[] mapped = new double[miningSchema.getFieldsAsInstances().numAttributes()];

    // Copy over the values
    for (int field = 0; field < schemaHeader.numAttributes(); field++) {
      int sourceIndex = m_fieldsMap[field];
      double rawValue = inst.value(sourceIndex);
      mapped[field] = rawValue;

      boolean categorical =
          schemaHeader.attribute(field).isNominal() || schemaHeader.attribute(field).isString();
      if (!categorical || Utils.isMissingValue(rawValue)) {
        // numeric and missing values are passed through as they are
        continue;
      }

      // look up the index of this incoming categorical value in the mining schema
      int[] valueMap = m_nominalValueMaps[field];
      int schemaIndex = valueMap[(int) rawValue];
      String incomingLabel = inst.attribute(sourceIndex).value((int) rawValue);

      if (schemaIndex >= 0) {
        mapped[field] = schemaIndex;
        continue;
      }

      // set this to "unknown" (-1) for nominal valued attributes
      mapped[field] = UNKNOWN_NOMINAL_VALUE;
      String warning = "[MappingInfo] WARNING: Can't match nominal value " + incomingLabel;
      if (m_log == null) {
        System.err.println(warning);
      } else {
        m_log.logMessage(warning);
      }
    }

    // Now deal with missing values and outliers...
    miningSchema.applyMissingAndOutlierTreatments(mapped);

    // now fill in any derived values, which are stored after the schema fields
    ArrayList<DerivedFieldMetaInfo> derivedFields = miningSchema.getDerivedFields();
    for (int d = 0; d < derivedFields.size(); d++) {
      double derivedValue = derivedFields.get(d).getDerivedValue(mapped);
      mapped[schemaHeader.numAttributes() + d] = derivedValue;
    }

    return mapped;
  }
예제 #12
0
  /**
   * Prepares the three datasets and the classifier array: the class index of the test, labeled and
   * unlabeled datasets is set to their respective last attribute, and the three classifiers are
   * stored in the first three slots of {@code class_Array}. Takes no parameters.
   */
  private void Init() {
    // the class attribute is the last attribute of each dataset
    testIns.setClassIndex(testIns.numAttributes() - 1);
    labeledIns.setClassIndex(labeledIns.numAttributes() - 1);
    unlabeledIns.setClassIndex(unlabeledIns.numAttributes() - 1);

    class_Array[0] = classifier1;
    class_Array[1] = classifier2;
    class_Array[2] = classifier3;
  }
예제 #13
0
  /**
   * Downloads the trace ARFF file of a run and converts it into a {@link RunTrace}.
   *
   * <p>The trace must contain the attributes repeat, fold, iteration, evaluation and selected, at
   * least one attribute whose name starts with "parameter_", and no more than six attributes
   * besides the parameter attributes. For every row the parameter values are serialised to a JSON
   * object that becomes the setup string of the trace iteration.
   *
   * @param file_id id of the trace file on the server
   * @param task_id id of the task, used to build the file name
   * @param run_id id of the run the trace belongs to
   * @return the populated run trace
   * @throws Exception if the file cannot be read or does not meet the requirements above
   */
  private RunTrace traceToXML(int file_id, int task_id, int run_id) throws Exception {
    RunTrace trace = new RunTrace(run_id);
    URL traceURL = apiconnector.getOpenmlFileUrl(file_id, "Task_" + task_id + "_trace.arff");

    // Instances(Reader) consumes the reader but does not close it; close it here so the
    // underlying connection is released even when parsing fails.
    Instances traceDataset;
    BufferedReader traceReader = new BufferedReader(Input.getURL(traceURL));
    try {
      traceDataset = new Instances(traceReader);
    } finally {
      traceReader.close();
    }

    List<Integer> parameterIndexes = new ArrayList<Integer>();

    if (traceDataset.attribute("repeat") == null
        || traceDataset.attribute("fold") == null
        || traceDataset.attribute("iteration") == null
        || traceDataset.attribute("evaluation") == null
        || traceDataset.attribute("selected") == null) {
      throw new Exception("trace file missing mandatory attributes. ");
    }

    for (int i = 0; i < traceDataset.numAttributes(); ++i) {
      if (traceDataset.attribute(i).name().startsWith("parameter_")) {
        parameterIndexes.add(i);
      }
    }
    if (parameterIndexes.isEmpty()) {
      throw new Exception(
          "trace file contains no fields with prefix 'parameter_' (i.e., parameters are not registered). ");
    }
    if (traceDataset.numAttributes() > 6 + parameterIndexes.size()) {
      throw new Exception(
          "trace file contains illegal attributes (only allow for repeat, fold, iteration, evaluation, selected, setup_string and parameter_*). ");
    }

    // The mandatory attributes are known to exist; resolve their positions once, not per row.
    int repeatIdx = traceDataset.attribute("repeat").index();
    int foldIdx = traceDataset.attribute("fold").index();
    int iterationIdx = traceDataset.attribute("iteration").index();
    int evaluationIdx = traceDataset.attribute("evaluation").index();
    int selectedIdx = traceDataset.attribute("selected").index();

    for (int i = 0; i < traceDataset.numInstances(); ++i) {
      Instance current = traceDataset.get(i);
      Integer repeat = (int) current.value(repeatIdx);
      Integer fold = (int) current.value(foldIdx);
      Integer iteration = (int) current.value(iterationIdx);
      Double evaluation = current.value(evaluationIdx);
      Boolean selected = current.stringValue(selectedIdx).equals("true");

      // numeric parameters are stored as their double representation, all others as text
      Map<String, String> parameters = new HashMap<String, String>();
      for (int attIdx : parameterIndexes) {
        String parameterName = traceDataset.attribute(attIdx).name();
        if (traceDataset.attribute(attIdx).isNumeric()) {
          parameters.put(parameterName, current.value(attIdx) + "");
        } else {
          parameters.put(parameterName, current.stringValue(attIdx));
        }
      }
      String setup_string = new JSONObject(parameters).toString();

      trace.addIteration(
          new RunTrace.Trace_iteration(
              repeat, fold, iteration, setup_string, evaluation, selected));
    }

    return trace;
  }
예제 #14
0
  /**
   * Resets the per-attribute minimum and maximum arrays to NaN and then folds every instance of
   * the dataset into them via {@code updateMinMax}.
   *
   * @param data the dataset whose value ranges are to be determined
   */
  protected void initMinMax(Instances data) {
    final int attributeCount = data.numAttributes();

    m_Min = new double[attributeCount];
    m_Max = new double[attributeCount];
    for (int att = 0; att < attributeCount; att++) {
      // NaN marks "no value seen yet"
      m_Max[att] = Double.NaN;
      m_Min[att] = Double.NaN;
    }

    int row = 0;
    while (row < data.numInstances()) {
      updateMinMax(data.instance(row));
      row++;
    }
  }
예제 #15
0
  /**
   * Determines the output format based on the input format and returns this. Every numeric
   * attribute inside the selected column range is replaced by a nominal attribute of the same name
   * whose labels are the sorted distinct non-missing values found in the data; all other
   * attributes are kept as they are. In case the output format cannot be returned immediately,
   * i.e., immediateOutputFormat() returns false, then this method will be called from
   * batchFinished().
   *
   * @param inputFormat the input format to base the output format on
   * @return the output format
   * @throws Exception in case the determination goes wrong
   * @see #hasImmediateOutputFormat()
   * @see #batchFinished()
   */
  protected Instances determineOutputFormat(Instances inputFormat) throws Exception {

    m_Cols.setUpper(inputFormat.numAttributes() - 1);
    Instances data = new Instances(inputFormat);
    FastVector atts = new FastVector();

    for (int i = 0; i < data.numAttributes(); i++) {
      boolean convert = m_Cols.isInRange(i) && data.attribute(i).isNumeric();
      if (!convert) {
        atts.addElement(data.attribute(i));
        continue;
      }

      // date values are collected by their string form, plain numbers as Doubles
      boolean isDate = (data.attribute(i).type() == Attribute.DATE);

      // determine all distinct values of this attribute in the dataset
      HashSet distinct = new HashSet();
      for (int n = 0; n < data.numInstances(); n++) {
        Instance inst = data.instance(n);
        if (inst.isMissing(i)) {
          continue;
        }
        if (isDate) {
          distinct.add(inst.stringValue(i));
        } else {
          distinct.add(Double.valueOf(inst.value(i)));
        }
      }

      // sort values
      Vector sorted = new Vector(distinct);
      Collections.sort(sorted);

      // create attribute from sorted values
      FastVector labels = new FastVector();
      for (Object value : sorted) {
        if (isDate) {
          labels.addElement(value.toString());
        } else {
          labels.addElement(Utils.doubleToString(((Double) value).doubleValue(), MAX_DECIMALS));
        }
      }
      atts.addElement(new Attribute(data.attribute(i).name(), labels));
    }

    Instances result = new Instances(inputFormat.relationName(), atts, 0);
    result.setClassIndex(inputFormat.classIndex());

    return result;
  }
  /**
   * Performs the sub task: computes the region probabilities for every column of this task's row
   * and stores them in the task result.
   *
   * <p>The execution status is set to PROCESSING at the start and to FINISHED or FAILED at the
   * end. Missing training data, classifier or data generator, as well as nominal visualization
   * dimensions, are reported as a failed row rather than propagated to the caller.
   */
  public void execute() {

    m_random = new Random(m_rowNumber * 11);
    m_result = new RemoteResult(m_rowNumber, m_panelWidth);
    m_status.setTaskResult(m_result);
    m_status.setExecutionStatus(TaskStatusInfo.PROCESSING);

    try {
      // Validate the configuration before anything dereferences it, so that a missing
      // component produces its descriptive message instead of a NullPointerException.
      if (m_trainingData == null) {
        throw new Exception("No training data set (BoundaryPanel)");
      }
      if (m_classifier == null) {
        throw new Exception("No classifier set (BoundaryPanel)");
      }
      if (m_dataGenerator == null) {
        throw new Exception("No data generator set (BoundaryPanel)");
      }
      if (m_trainingData.attribute(m_xAttribute).isNominal()
          || m_trainingData.attribute(m_yAttribute).isNominal()) {
        throw new Exception(
            "Visualization dimensions must be numeric " + "(RemoteBoundaryVisualizerSubTask)");
      }

      m_dataGenerator.setSeed(m_rowNumber * 11);
      m_numOfSamplesPerGenerator =
          (int) Math.pow(m_samplesBase, m_trainingData.numAttributes() - 3);

      m_attsToWeightOn = new boolean[m_trainingData.numAttributes()];
      m_attsToWeightOn[m_xAttribute] = true;
      m_attsToWeightOn[m_yAttribute] = true;

      // generate samples
      m_weightingAttsValues = new double[m_attsToWeightOn.length];
      m_vals = new double[m_trainingData.numAttributes()];
      m_predInst = new Instance(1.0, m_vals);
      m_predInst.setDataset(m_trainingData);

      System.err.println("Executing row number " + m_rowNumber);
      for (int j = 0; j < m_panelWidth; j++) {
        double[] preds = calculateRegionProbs(j, m_rowNumber);
        m_result.setLocationProbs(j, preds);
        m_result.setPercentCompleted((int) (100 * ((double) j / (double) m_panelWidth)));
      }
    } catch (Exception ex) {
      m_status.setExecutionStatus(TaskStatusInfo.FAILED);
      m_status.setStatusMessage("Row " + m_rowNumber + " failed.");
      System.err.print(ex);
      return;
    }

    // finished
    m_status.setExecutionStatus(TaskStatusInfo.FINISHED);
    m_status.setStatusMessage("Row " + m_rowNumber + " completed successfully.");
  }
예제 #17
0
  /**
   * Constructs an instance suitable for passing to the model for scoring. Each model attribute is
   * filled from the incoming attribute it is mapped to; unmapped attributes, missing incoming
   * values, nominal labels unknown to the model and the class attribute are set to missing.
   *
   * @param incoming the incoming instance
   * @return an instance with values mapped to be consistent with what the model is expecting
   */
  protected Instance mapIncomingFieldsToModelFields(Instance incoming) {
    Instances header = m_model.getHeader();
    double[] mapped = new double[header.numAttributes()];

    for (int target = 0; target < header.numAttributes(); target++) {
      int sourceIndex = m_attributeMap[target];

      if (sourceIndex < 0) {
        // missing or type mismatch
        mapped[target] = Utils.missingValue();
        continue;
      }

      Attribute targetAtt = header.attribute(target);
      Attribute sourceAtt = incoming.dataset().attribute(sourceIndex);

      if (incoming.isMissing(sourceAtt.index())) {
        mapped[target] = Utils.missingValue();
        continue;
      }

      if (targetAtt.isNumeric()) {
        mapped[target] = incoming.value(sourceIndex);
      } else if (targetAtt.isNominal()) {
        // translate the incoming label into the model's index for that label
        int modelIndex = targetAtt.indexOfValue(incoming.stringValue(sourceIndex));
        if (modelIndex >= 0) {
          mapped[target] = modelIndex;
        } else {
          mapped[target] = Utils.missingValue();
        }
      } else if (targetAtt.isString()) {
        // the string is stored on the model attribute itself and referenced as value 0
        mapped[target] = 0;
        targetAtt.setStringValue(incoming.stringValue(sourceIndex));
      }
    }

    // set class to missing value
    int classIndex = header.classIndex();
    if (classIndex >= 0) {
      mapped[classIndex] = Utils.missingValue();
    }

    // keep the sparse/dense representation of the incoming instance
    Instance result;
    if (incoming instanceof SparseInstance) {
      result = new SparseInstance(incoming.weight(), mapped);
    } else {
      result = new DenseInstance(incoming.weight(), mapped);
    }
    result.setDataset(header);
    return result;
  }
예제 #18
0
  /**
   * Determines the output format based on the input format and returns this. A random subset of
   * the non-class attributes is selected (a fraction of them when the configured number is below
   * 1, otherwise at most that many); the class attribute, if set, is always kept and placed last.
   * The chosen input indices are stored in {@code m_Indices}. In case the output format cannot be
   * returned immediately, i.e., hasImmediateOutputFormat() returns false, then this method will
   * called from batchFinished() after the call of preprocess(Instances), in which, e.g.,
   * statistics for the actual processing step can be gathered.
   *
   * @param inputFormat the input format to base the output format on
   * @return the output format
   * @throws Exception in case the determination goes wrong
   */
  protected Instances determineOutputFormat(Instances inputFormat) throws Exception {
    final boolean hasClass = inputFormat.classIndex() > -1;

    // determine the number of attributes
    int numAtts = inputFormat.numAttributes();
    if (hasClass) {
      numAtts--;
    }
    if (m_NumAttributes < 1) {
      numAtts = (int) Math.round((double) numAtts * m_NumAttributes);
    } else if (m_NumAttributes < numAtts) {
      numAtts = (int) m_NumAttributes;
    }
    if (getDebug()) {
      System.out.println("# of atts: " + numAtts);
    }

    // candidate indices: everything except the class
    Vector<Integer> candidates = new Vector<Integer>();
    for (int att = 0; att < inputFormat.numAttributes(); att++) {
      if (att != inputFormat.classIndex()) {
        candidates.add(att);
      }
    }

    // draw without replacement
    Vector<Integer> subset = new Vector<Integer>();
    Random rand = new Random(m_Seed);
    for (int drawn = 0; drawn < numAtts; drawn++) {
      int position = rand.nextInt(candidates.size());
      subset.add(candidates.remove(position));
    }
    Collections.sort(subset);
    if (hasClass) {
      subset.add(inputFormat.classIndex());
    }
    if (getDebug()) {
      System.out.println("indices: " + subset);
    }

    // generate output format
    FastVector atts = new FastVector();
    m_Indices = new int[subset.size()];
    for (int slot = 0; slot < subset.size(); slot++) {
      int sourceIndex = subset.get(slot);
      atts.addElement(inputFormat.attribute(sourceIndex));
      m_Indices[slot] = sourceIndex;
    }
    Instances result = new Instances(inputFormat.relationName(), atts, 0);
    if (hasClass) {
      result.setClassIndex(result.numAttributes() - 1);
    }

    return result;
  }
예제 #19
0
  public KDTreeBufferCPU(
      Instances dataset, Context ctx, int tree_depth, DenseInstanceBuffer buffer) {
    m_instance_data = buffer;
    m_tree_depth = tree_depth;
    m_leaf_node_ids = new int[buffer.rows()];

    m_branch_nodes = (int) (Math.pow(2, tree_depth + 1) - 1) - (int) Math.pow(2, tree_depth);
    m_dataset = dataset;
    m_num_attributes = dataset.numAttributes();

    m_max_temp = new double[m_dataset.numAttributes() * m_branch_nodes];
    m_min_temp = new double[m_dataset.numAttributes() * m_branch_nodes];
    m_node_split_dim = new int[m_branch_nodes];
    m_node_split_value = new double[m_branch_nodes];
  }
예제 #20
0
  /**
   * Computes the clustering accuracy of a cluster assignment against the class attribute (the
   * last attribute) of the data.
   *
   * <p>A class-by-cluster contingency matrix is built and printed to standard output. For every
   * class the cluster holding most of its instances is determined (the lowest cluster id wins a
   * tie), and the result is the fraction of instances that lie in the dominant cluster of their
   * own class.
   *
   * @param odata the data; the values of its last attribute are used as class indices
   * @param clusters the cluster id of every instance, in instance order
   * @return the fraction of instances assigned to the dominant cluster of their class
   * @throws IllegalArgumentException if there are no instances or if the number of cluster
   *     assignments differs from the number of instances
   */
  public static double CA(Instances odata, int[] clusters) {
    double[] classValues = odata.attributeToDoubleArray(odata.numAttributes() - 1);
    if (classValues.length == 0) {
      throw new IllegalArgumentException("CA: the dataset contains no instances");
    }
    if (clusters.length != classValues.length) {
      throw new IllegalArgumentException(
          "CA: expected "
              + classValues.length
              + " cluster assignments but got "
              + clusters.length);
    }

    int[] oclass = new int[classValues.length];
    int maxClass = (int) classValues[0];
    int maxCluster = clusters[0];
    for (int i = 0; i < oclass.length; ++i) {
      oclass[i] = (int) classValues[i];
      maxClass = Math.max(maxClass, oclass[i]);
      maxCluster = Math.max(maxCluster, clusters[i]);
    }

    // M[c][k] = number of instances of class c that were put into cluster k
    int[][] M = new int[maxClass + 1][maxCluster + 1];
    for (int i = 0; i < clusters.length; ++i) {
      M[oclass[i]][clusters[i]]++;
    }
    for (int i = 0; i < M.length; ++i) {
      System.out.println(Arrays.toString(M[i]));
    }

    // True argmax per class row, kept in its own array so the counts are not overwritten.
    int[] dominantCluster = new int[M.length];
    for (int c = 0; c < M.length; ++c) {
      int best = 0;
      for (int k = 1; k < M[c].length; ++k) {
        if (M[c][k] > M[c][best]) {
          best = k;
        }
      }
      dominantCluster[c] = best;
    }

    int matches = 0;
    for (int i = 0; i < oclass.length; ++i) {
      if (dominantCluster[oclass[i]] == clusters[i]) {
        matches++;
      }
    }

    return (double) matches / (double) oclass.length;
  }
예제 #21
0
  /**
   * Generate artificial training examples. Nominal values are drawn from the stored cumulative
   * distribution of the attribute, numeric values from a Gaussian with the stored mean and
   * standard deviation. Attributes of any other type keep the value 0 and cause a message on
   * standard error.
   *
   * @param artSize size of examples set to create
   * @param data training data
   * @return the set of unlabeled artificial examples
   */
  protected Instances generateArtificialData(int artSize, Instances data) {
    final int numAttributes = data.numAttributes();
    Instances artData = new Instances(data, artSize);

    for (int created = 0; created < artSize; created++) {
      double[] values = new double[numAttributes];

      for (int att = 0; att < numAttributes; att++) {
        boolean nominal = data.attribute(att).isNominal();
        if (!nominal && !data.attribute(att).isNumeric()) {
          System.err.println("Decorate can only handle numeric and nominal values.");
          continue;
        }

        double[] stats = (double[]) m_AttributeStats.get(att);
        if (nominal) {
          // Select nominal value based on the frequency of occurrence in the training data
          values[att] = (double) selectIndexProbabilistically(stats);
        } else {
          // Generate numeric value from the Gaussian distribution
          // defined by the mean (stats[0]) and std dev (stats[1]) of the attribute
          values[att] = (m_Random.nextGaussian() * stats[1]) + stats[0];
        }
      }

      artData.add(new Instance(1.0, values));
    }
    return artData;
  }
예제 #22
0
  /**
   * Compute and store statistics required for generating artificial data.
   *
   * <p>For a nominal attribute the Laplace-smoothed value probabilities are stored as cumulative
   * probabilities, with the last value omitted. For a numeric attribute the mean and the standard
   * deviation are stored. Attributes of any other type get a {@code null} entry so that the
   * position of every entry always equals the index of its attribute.
   *
   * @param data training instances
   * @throws Exception if statistics could not be calculated successfully, in particular if a
   *     nominal attribute has fewer than two values
   */
  protected void computeStats(Instances data) throws Exception {
    int numAttributes = data.numAttributes();
    m_AttributeStats = new Vector(numAttributes); // use to map attributes to their stats

    for (int j = 0; j < numAttributes; j++) {
      if (data.attribute(j).isNominal()) {
        // Compute the probability of occurrence of each distinct value
        int[] nomCounts = (data.attributeStats(j)).nominalCounts;
        double[] counts = new double[nomCounts.length];
        if (counts.length < 2) {
          throw new Exception("Nominal attribute has less than two distinct values!");
        }
        // Perform Laplace smoothing
        for (int i = 0; i < counts.length; i++) {
          counts[i] = nomCounts[i] + 1;
        }
        Utils.normalize(counts);
        double[] stats = new double[counts.length - 1];
        stats[0] = counts[0];
        // Calculate cumulative probabilities
        for (int i = 1; i < stats.length; i++) {
          stats[i] = stats[i - 1] + counts[i];
        }
        m_AttributeStats.add(j, stats);
      } else if (data.attribute(j).isNumeric()) {
        // Get mean and standard deviation from the training data
        double[] stats = new double[2];
        stats[0] = data.meanOrMode(j);
        stats[1] = Math.sqrt(data.variance(j));
        m_AttributeStats.add(j, stats);
      } else {
        System.err.println("Decorate can only handle numeric and nominal values.");
        // Keep the vector aligned with the attribute indices: without a placeholder the next
        // add(j + 1, ...) would be past the end of the vector and throw.
        m_AttributeStats.add(j, null);
      }
    }
  }
 /**
  * Runs the filter on the test data and checks the shape of the result: five more attributes than
  * the input and the same number of instances.
  */
 public void testTypical() {
   Instances filtered = useFilter();
   int expectedAttributes = m_Instances.numAttributes() + 5;
   int expectedInstances = m_Instances.numInstances();
   // Five attributes are expected to be added; the number of instances shouldn't change
   assertEquals(expectedAttributes, filtered.numAttributes());
   assertEquals(expectedInstances, filtered.numInstances());
   // Eibe can enhance this to check the binarizing is correct.
 }
 /**
  * Trains the classifier on the training set, defaulting the class attribute to the last
  * attribute when none is set, and then runs the evaluation.
  *
  * @throws Exception if building or evaluating the classifier fails
  */
 @Override
 public void train() throws Exception {
   boolean classUnset = _train.classIndex() == -1;
   if (classUnset) {
     _train.setClassIndex(_train.numAttributes() - 1);
   }
   _cl.buildClassifier(_train);

   // evaluate classifier and print some statistics
   evaluate();
 }
예제 #25
0
  /**
   * Sets the format of the input instances. Every selected non-class attribute is renamed by
   * appending the sign and the absolute value of the instance range to its name.
   *
   * @param instanceInfo an Instances object containing the input instance structure (any instances
   *     contained in the object are ignored - only the structure is required).
   * @return true if the outputFormat may be collected immediately
   * @throws IllegalArgumentException if a class index is set but filling with missing values is
   *     not enabled
   * @throws UnsupportedAttributeTypeException if selected attributes are not numeric or nominal.
   */
  public boolean setInputFormat(Instances instanceInfo) throws Exception {

    // A class index of 0 is a valid class attribute as well; only a negative index means "unset".
    if ((instanceInfo.classIndex() >= 0) && (!getFillWithMissing())) {
      throw new IllegalArgumentException(
          "TimeSeriesTranslate: Need to fill in missing values "
              + "using appropriate option when class index is set.");
    }
    super.setInputFormat(instanceInfo);
    // Create the output buffer
    Instances outputFormat = new Instances(instanceInfo, 0);
    for (int i = 0; i < instanceInfo.numAttributes(); i++) {
      if (i == instanceInfo.classIndex() || !m_SelectedCols.isInRange(i)) {
        // the class and unselected attributes keep their names
        continue;
      }
      if (outputFormat.attribute(i).isNominal() || outputFormat.attribute(i).isNumeric()) {
        outputFormat.renameAttribute(
            i,
            outputFormat.attribute(i).name()
                + (m_InstanceRange < 0 ? '-' : '+')
                + Math.abs(m_InstanceRange));
      } else {
        throw new UnsupportedAttributeTypeException(
            "Only numeric and nominal attributes may be " + " manipulated in time series.");
      }
    }
    outputFormat.setClassIndex(instanceInfo.classIndex());
    setOutputFormat(outputFormat);
    return true;
  }
  /**
   * Calculates the centroid pivot of a node based on the list of points that it contains (the two
   * lists of its children are provided).
   *
   * <p>The class attribute (taken from {@code m_Instances}) is left at zero in the centroid. If
   * both lists are empty the sums are divided by zero, so the centroid values are NaN.
   *
   * @param list1 The point index list of first child.
   * @param list2 The point index list of second child.
   * @param insts The insts object on which the tree is being built (for header information).
   * @return The centroid pivot of the node.
   */
  public Instance calcPivot(MyIdxList list1, MyIdxList list2, Instances insts) {
    int classIdx = m_Instances.classIndex();
    double[] attrVals = new double[insts.numAttributes()];

    addAttributeValues(list1, insts, classIdx, attrVals);
    addAttributeValues(list2, insts, classIdx, attrVals);

    int numInsts = list1.length() + list2.length();
    for (int j = 0; j < attrVals.length; j++) {
      attrVals[j] /= numInsts;
    }
    return new DenseInstance(1.0, attrVals);
  }

  /** Adds the non-class attribute values of every point in {@code list} to {@code sums}. */
  private void addAttributeValues(MyIdxList list, Instances insts, int classIdx, double[] sums) {
    for (int i = 0; i < list.length(); i++) {
      Instance point = insts.instance(((ListNode) list.get(i)).idx);
      for (int k = 0; k < point.numValues(); k++) {
        // k is a position among the stored values; index(k) is the attribute it belongs to.
        // The two only coincide for dense instances, so the sum must be keyed by index(k).
        int attIdx = point.index(k);
        if (attIdx == classIdx) {
          continue;
        }
        sums[attIdx] += point.valueSparse(k);
      }
    }
  }
  /**
   * Cross-validates a NaiveBayes learner wrapped in a FilteredClassifier on the given data and
   * prints the summary, the per-class details and the confusion matrix to standard output.
   *
   * <p>The last attribute of {@code data} is made the class attribute. The evaluation uses 10
   * folds and a random generator seeded with 1.
   *
   * @param data the dataset to evaluate on; its class index is set by this method
   * @throws Exception propagated from the evaluation
   */
  public static void wekaAlgorithms(Instances data) throws Exception {
    // NaiveBayes is the base learner in use; J48, RandomForest, ZeroR and IBk were
    // alternatives tried here earlier.
    classifier = new FilteredClassifier();
    classifier.setClassifier(new NaiveBayes());

    int lastAttribute = data.numAttributes() - 1;
    data.setClassIndex(lastAttribute);

    final int folds = 10;
    Evaluation eval = new Evaluation(data);
    eval.crossValidateModel(classifier, data, folds, new Random(1));

    System.out.println("===== Evaluating on filtered (training) dataset =====");
    System.out.println(eval.toSummaryString());
    System.out.println(eval.toClassDetailsString());

    double[][] matrix = eval.confusionMatrix();
    System.out.println("========= Confusion Matrix =========");
    int size = matrix.length;
    for (int row = 0; row < size; row++) {
      // Build one output line per matrix row: cells separated by two spaces
      StringBuilder line = new StringBuilder();
      for (int col = 0; col < size; col++) {
        line.append(matrix[row][col]).append("  ");
      }
      System.out.println(line.append(" ").toString());
    }
  }
예제 #28
0
  /**
   * Tests the ThresholdCurve generation from the command line. The classifier is currently
   * hardcoded. Pipe in an arff file.
   *
   * <p>The dataset is read from standard input and its last attribute is used as the class.
   * Predictions from two runs of 10-fold cross-validation (seeds 0 and 1) are pooled, and the
   * curve computed from them is printed to standard output. Any exception is caught and its stack
   * trace printed.
   *
   * @param args currently ignored
   */
  public static void main(String[] args) {

    try {
      Instances inst = new Instances(new java.io.InputStreamReader(System.in));
      inst.setClassIndex(inst.numAttributes() - 1);

      ThresholdCurve tc = new ThresholdCurve();
      EvaluationUtils eu = new EvaluationUtils();
      Classifier classifier = new weka.classifiers.functions.Logistic();
      FastVector predictions = new FastVector();
      for (int i = 0; i < 2; i++) { // Do two runs, each with its own seed.
        eu.setSeed(i);
        predictions.appendElements(eu.getCVPredictions(classifier, inst, 10));
      }

      Instances result = tc.getCurve(predictions);
      System.out.println(result);
    } catch (Exception ex) {
      ex.printStackTrace();
    }
  }
예제 #29
0
  /**
   * Resolves a comma-separated list of attribute names against a dataset. Names are compared in
   * lower case. If every name in {@code list} is an attribute of {@code inst}, those names are
   * returned (lower-cased, in list order); if any name is unknown, or {@code list} yields no
   * names at all, the lower-cased names from {@code defaultList} are returned instead.
   *
   * @param list comma-separated list of attribute names
   * @param defaultList the default list of attribute names
   * @param inst the instances to get the attribute names from
   * @return a vector containing attribute names
   */
  protected Vector determineColumnNames(String list, String defaultList, Instances inst) {
    // lower-cased names of all attributes of the dataset
    Vector<String> known = new Vector<String>();
    for (int idx = 0; idx < inst.numAttributes(); idx++) {
      String attName = inst.attribute(idx).name();
      known.add(attName.toLowerCase());
    }

    // take over the requested names, giving up on the first unknown one
    Vector<String> chosen = new Vector<String>();
    for (StringTokenizer tokens = new StringTokenizer(list, ","); tokens.hasMoreTokens(); ) {
      String name = tokens.nextToken().toLowerCase();
      if (!known.contains(name)) {
        chosen.clear();
        break;
      }
      chosen.add(name);
    }

    // nothing usable was requested: fall back to the defaults
    if (chosen.isEmpty()) {
      StringTokenizer defaults = new StringTokenizer(defaultList, ",");
      while (defaults.hasMoreTokens()) {
        chosen.add(defaults.nextToken().toLowerCase());
      }
    }

    return chosen;
  }
예제 #30
0
 /**
  * Calculates the average of every column except the last one.
  *
  * <p>NaN values are left out of a column's sum, but the sum is always divided by the total
  * number of instances. NOTE(review): this means rows with NaN still count in the divisor -
  * confirm that this is intended.
  *
  * @param inst the dataset whose columns are averaged
  * @return one average per attribute, in attribute order, without the last attribute
  */
 public Double[] calculateAverage(Instances inst) {
   int columns = inst.numAttributes() - 1;
   double[] sums = new double[columns];

   // Sum up every column, skipping NaN entries
   for (int r = 0; r < inst.numInstances(); r++) {
     Instance row = inst.instance(r);
     int rowColumns = row.numAttributes() - 1;
     for (int c = 0; c < rowColumns; c++) {
       double cell = row.value(c);
       if (Double.isNaN(cell)) {
         continue;
       }
       sums[c] += cell;
     }
   }

   // Divide by the number of instances and box the results
   Double[] average = new Double[columns];
   for (int c = 0; c < columns; c++) {
     average[c] = sums[c] / inst.numInstances();
   }
   return average;
 }