Example #1
File: WekaUtil.java Project: kulashish/qh
 public static Instances getInstances(String file) throws Exception {
   DataSource datasource = new DataSource(file);
   Instances data = datasource.getDataSet();
   System.out.println("Class index is : " + data.classIndex());
   if (data.classIndex() == -1) data.setClassIndex(data.numAttributes() - 1);
   return data;
 }
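A minimal usage sketch (not from the original project) showing how a helper like getInstances() is typically driven; the file name "iris.arff" is an assumption.

import weka.core.Instances;
import weka.core.converters.ConverterUtils.DataSource;

public class LoadArffExample {
  public static void main(String[] args) throws Exception {
    // DataSource accepts any format WEKA has a loader for (ARFF, CSV, ...)
    DataSource source = new DataSource("iris.arff"); // assumed file name
    Instances data = source.getDataSet();
    // classIndex() returns -1 until a class attribute is chosen explicitly
    if (data.classIndex() == -1) {
      data.setClassIndex(data.numAttributes() - 1);
    }
    System.out.println("Relation: " + data.relationName()
        + ", class attribute: " + data.classAttribute().name());
  }
}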
  /**
   * Method for building this classifier.
   *
   * @param training the training instances
   * @param test the test instances
   * @throws Exception if something goes wrong
   */
  public void buildClassifier(Instances training, Instances test) throws Exception {
    m_ClassifierBuilt = true;
    m_Random = new Random(m_Seed);
    m_Trainset = training;
    m_Testset = test;

    // set class index?
    if ((m_Trainset.classIndex() == -1) || (m_Testset.classIndex() == -1)) {
      m_Trainset.setClassIndex(m_Trainset.numAttributes() - 1);
      m_Testset.setClassIndex(m_Trainset.numAttributes() - 1);
    }

    // are datasets correct?
    checkData();

    // any other data restrictions not met?
    checkRestrictions();

    // generate sets
    generateSets();

    // performs the restarts/iterations
    build();

    m_Random = null;
  }
Example #3
  /**
   * Sets the format of the input instances.
   *
   * @param instanceInfo an Instances object containing the input instance structure (any instances
   *     contained in the object are ignored - only the structure is required).
   * @return true if the outputFormat may be collected immediately
   * @throws UnsupportedAttributeTypeException if selected attributes are not numeric or nominal.
   */
  public boolean setInputFormat(Instances instanceInfo) throws Exception {

    if ((instanceInfo.classIndex() > 0) && (!getFillWithMissing())) {
      throw new IllegalArgumentException(
          "TimeSeriesTranslate: Need to fill in missing values "
              + "using appropriate option when class index is set.");
    }
    super.setInputFormat(instanceInfo);
    // Create the output buffer
    Instances outputFormat = new Instances(instanceInfo, 0);
    for (int i = 0; i < instanceInfo.numAttributes(); i++) {
      if (i != instanceInfo.classIndex()) {
        if (m_SelectedCols.isInRange(i)) {
          if (outputFormat.attribute(i).isNominal() || outputFormat.attribute(i).isNumeric()) {
            outputFormat.renameAttribute(
                i,
                outputFormat.attribute(i).name()
                    + (m_InstanceRange < 0 ? '-' : '+')
                    + Math.abs(m_InstanceRange));
          } else {
            throw new UnsupportedAttributeTypeException(
                "Only numeric and nominal attributes may be " + " manipulated in time series.");
          }
        }
      }
    }
    outputFormat.setClassIndex(instanceInfo.classIndex());
    setOutputFormat(outputFormat);
    return true;
  }
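A hedged usage sketch for the filter above, based on the options TimeSeriesTranslate inherits from AbstractTimeSeries (attribute range, instance range, fill-with-missing); the file name is an assumption.

import weka.core.Instances;
import weka.core.converters.ConverterUtils.DataSource;
import weka.filters.Filter;
import weka.filters.unsupervised.attribute.TimeSeriesTranslate;

public class TimeSeriesTranslateExample {
  public static void main(String[] args) throws Exception {
    Instances data = new DataSource("timeseries.arff").getDataSet(); // assumed file name
    TimeSeriesTranslate filter = new TimeSeriesTranslate();
    filter.setAttributeIndices("first-last"); // attributes to shift
    filter.setInstanceRange(-1);              // use the value from the previous instance
    filter.setFillWithMissing(true);          // required when a class index is set
    filter.setInputFormat(data);              // runs the checks shown above
    Instances shifted = Filter.useFilter(data, filter);
    System.out.println(shifted.toSummaryString());
  }
}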
Example #4
  /**
   * Builds a regression model for the given data.
   *
   * @param data the training data to be used for generating the linear regression function
   * @throws Exception if the classifier could not be built successfully
   */
  public void buildClassifier(Instances data) throws Exception {

    if (!m_checksTurnedOff) {
      // can classifier handle the data?
      getCapabilities().testWithFail(data);

      // remove instances with missing class
      data = new Instances(data);
      data.deleteWithMissingClass();
    }

    // Preprocess instances
    if (!m_checksTurnedOff) {
      m_TransformFilter = new NominalToBinary();
      m_TransformFilter.setInputFormat(data);
      data = Filter.useFilter(data, m_TransformFilter);
      m_MissingFilter = new ReplaceMissingValues();
      m_MissingFilter.setInputFormat(data);
      data = Filter.useFilter(data, m_MissingFilter);
      data.deleteWithMissingClass();
    } else {
      m_TransformFilter = null;
      m_MissingFilter = null;
    }

    m_ClassIndex = data.classIndex();
    m_TransformedData = data;

    // Turn all attributes on for a start
    m_SelectedAttributes = new boolean[data.numAttributes()];
    for (int i = 0; i < data.numAttributes(); i++) {
      if (i != m_ClassIndex) {
        m_SelectedAttributes[i] = true;
      }
    }
    m_Coefficients = null;

    // Compute means and standard deviations
    m_Means = new double[data.numAttributes()];
    m_StdDevs = new double[data.numAttributes()];
    for (int j = 0; j < data.numAttributes(); j++) {
      if (j != data.classIndex()) {
        m_Means[j] = data.meanOrMode(j);
        m_StdDevs[j] = Math.sqrt(data.variance(j));
        if (m_StdDevs[j] == 0) {
          m_SelectedAttributes[j] = false;
        }
      }
    }

    m_ClassStdDev = Math.sqrt(data.variance(m_TransformedData.classIndex()));
    m_ClassMean = data.meanOrMode(m_TransformedData.classIndex());

    // Perform the regression
    findBestModel();

    // Save memory
    m_TransformedData = new Instances(data, 0);
  }
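A short usage sketch (not part of the original source) that exercises this buildClassifier() through WEKA's LinearRegression; the ARFF file name is an assumption and the last attribute is assumed to be the numeric class.

import weka.classifiers.functions.LinearRegression;
import weka.core.Instances;
import weka.core.converters.ConverterUtils.DataSource;

public class LinearRegressionExample {
  public static void main(String[] args) throws Exception {
    Instances data = new DataSource("cpu.arff").getDataSet(); // assumed file name
    data.setClassIndex(data.numAttributes() - 1);             // numeric class attribute
    LinearRegression model = new LinearRegression();
    model.buildClassifier(data);   // invokes the method shown above
    System.out.println(model);     // prints the fitted coefficients
  }
}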
Example #5
 /** Builds the clusters */
 private void buildClusterer() throws Exception {
   if (m_trainingSet.classIndex() < 0) m_Clusterer.buildClusterer(m_trainingSet);
   else { // class based evaluation if class attribute is set
     Remove removeClass = new Remove();
     removeClass.setAttributeIndices("" + (m_trainingSet.classIndex() + 1));
     removeClass.setInvertSelection(false);
     removeClass.setInputFormat(m_trainingSet);
     Instances clusterTrain = Filter.useFilter(m_trainingSet, removeClass);
     m_Clusterer.buildClusterer(clusterTrain);
   }
 }
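A hedged sketch of the same idea outside the wrapper class: strip the class attribute before clustering, here with SimpleKMeans; the file name and cluster count are assumptions.

import weka.clusterers.SimpleKMeans;
import weka.core.Instances;
import weka.core.converters.ConverterUtils.DataSource;
import weka.filters.Filter;
import weka.filters.unsupervised.attribute.Remove;

public class ClusterWithoutClassExample {
  public static void main(String[] args) throws Exception {
    Instances data = new DataSource("iris.arff").getDataSet(); // assumed file name
    data.setClassIndex(data.numAttributes() - 1);

    // Remove expects 1-based attribute indices, hence classIndex() + 1
    Remove removeClass = new Remove();
    removeClass.setAttributeIndices("" + (data.classIndex() + 1));
    removeClass.setInputFormat(data);
    Instances clusterData = Filter.useFilter(data, removeClass);

    SimpleKMeans kMeans = new SimpleKMeans();
    kMeans.setNumClusters(3); // assumed cluster count
    kMeans.buildClusterer(clusterData);
    System.out.println(kMeans);
  }
}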
Example #6
  /**
   * Constructs an instance suitable for passing to the model for scoring
   *
   * @param incoming the incoming instance
   * @return an instance with values mapped to be consistent with what the model is expecting
   */
  protected Instance mapIncomingFieldsToModelFields(Instance incoming) {
    Instances modelHeader = m_model.getHeader();
    double[] vals = new double[modelHeader.numAttributes()];

    for (int i = 0; i < modelHeader.numAttributes(); i++) {

      if (m_attributeMap[i] < 0) {
        // missing or type mismatch
        vals[i] = Utils.missingValue();
        continue;
      }

      Attribute modelAtt = modelHeader.attribute(i);
      Attribute incomingAtt = incoming.dataset().attribute(m_attributeMap[i]);

      if (incoming.isMissing(incomingAtt.index())) {
        vals[i] = Utils.missingValue();
        continue;
      }

      if (modelAtt.isNumeric()) {
        vals[i] = incoming.value(m_attributeMap[i]);
      } else if (modelAtt.isNominal()) {
        String incomingVal = incoming.stringValue(m_attributeMap[i]);
        int modelIndex = modelAtt.indexOfValue(incomingVal);

        if (modelIndex < 0) {
          vals[i] = Utils.missingValue();
        } else {
          vals[i] = modelIndex;
        }
      } else if (modelAtt.isString()) {
        vals[i] = 0;
        modelAtt.setStringValue(incoming.stringValue(m_attributeMap[i]));
      }
    }

    if (modelHeader.classIndex() >= 0) {
      // set class to missing value
      vals[modelHeader.classIndex()] = Utils.missingValue();
    }

    Instance newInst = null;
    if (incoming instanceof SparseInstance) {
      newInst = new SparseInstance(incoming.weight(), vals);
    } else {
      newInst = new DenseInstance(incoming.weight(), vals);
    }

    newInst.setDataset(modelHeader);
    return newInst;
  }
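The key step above is building a value array laid out in the model header's attribute order and attaching that header to the new instance. A self-contained sketch of just that part; the tiny header is made up for illustration.

import java.util.ArrayList;
import weka.core.Attribute;
import weka.core.DenseInstance;
import weka.core.Instance;
import weka.core.Instances;
import weka.core.Utils;

public class HeaderMappingSketch {
  public static void main(String[] args) {
    // a made-up header: one numeric attribute plus a nominal class
    ArrayList<Attribute> atts = new ArrayList<Attribute>();
    atts.add(new Attribute("x"));
    ArrayList<String> labels = new ArrayList<String>();
    labels.add("yes");
    labels.add("no");
    atts.add(new Attribute("class", labels));
    Instances header = new Instances("demo", atts, 0);
    header.setClassIndex(header.numAttributes() - 1);

    // values follow the header's attribute order; the class slot stays missing
    double[] vals = new double[header.numAttributes()];
    vals[0] = 3.14;
    vals[header.classIndex()] = Utils.missingValue();

    Instance inst = new DenseInstance(1.0, vals);
    inst.setDataset(header); // attach the header so nominal lookups work
    System.out.println(inst);
  }
}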
Example #7
  /**
   * Determines the output format based on the input format and returns this. In case the output
   * format cannot be returned immediately, i.e., hasImmediateOutputFormat() returns false, then
   * this method will be called from batchFinished() after the call of preprocess(Instances), in which,
   * e.g., statistics for the actual processing step can be gathered.
   *
   * @param inputFormat the input format to base the output format on
   * @return the output format
   * @throws Exception in case the determination goes wrong
   */
  protected Instances determineOutputFormat(Instances inputFormat) throws Exception {
    Instances result;
    FastVector atts;
    int i;
    int numAtts;
    Vector<Integer> indices;
    Vector<Integer> subset;
    Random rand;
    int index;

    // determine the number of attributes
    numAtts = inputFormat.numAttributes();
    if (inputFormat.classIndex() > -1) numAtts--;

    if (m_NumAttributes < 1) {
      numAtts = (int) Math.round((double) numAtts * m_NumAttributes);
    } else {
      if (m_NumAttributes < numAtts) numAtts = (int) m_NumAttributes;
    }
    if (getDebug()) System.out.println("# of atts: " + numAtts);

    // determine random indices
    indices = new Vector<Integer>();
    for (i = 0; i < inputFormat.numAttributes(); i++) {
      if (i == inputFormat.classIndex()) continue;
      indices.add(i);
    }

    subset = new Vector<Integer>();
    rand = new Random(m_Seed);
    for (i = 0; i < numAtts; i++) {
      index = rand.nextInt(indices.size());
      subset.add(indices.get(index));
      indices.remove(index);
    }
    Collections.sort(subset);
    if (inputFormat.classIndex() > -1) subset.add(inputFormat.classIndex());
    if (getDebug()) System.out.println("indices: " + subset);

    // generate output format
    atts = new FastVector();
    m_Indices = new int[subset.size()];
    for (i = 0; i < subset.size(); i++) {
      atts.addElement(inputFormat.attribute(subset.get(i)));
      m_Indices[i] = subset.get(i);
    }
    result = new Instances(inputFormat.relationName(), atts, 0);
    if (inputFormat.classIndex() > -1) result.setClassIndex(result.numAttributes() - 1);

    return result;
  }
  public double ExpectedClassificationError(Instances pool, int attr_i) {

    // initialize alpha's to one
    int alpha[][][];
    int NumberOfFeatures = pool.numAttributes() - 1;
    int NumberOfLabels = pool.numClasses();

    alpha = new int[NumberOfFeatures][NumberOfLabels][];
    for (int i = 0; i < NumberOfFeatures; i++)
      for (int j = 0; j < NumberOfLabels; j++) alpha[i][j] = new int[pool.attribute(i).numValues()];

    for (int i = 0; i < NumberOfFeatures; i++)
      for (int j = 0; j < NumberOfLabels; j++)
        for (int k = 0; k < alpha[i][j].length; k++) alpha[i][j][k] = 1;

    // construct alpha's
    for (int i = 0; i < NumberOfFeatures; i++) // for each attribute
    {
      if (i == pool.classIndex()) { // skip the class attribute
        i++;
      }
      for (Enumeration<Instance> e = pool.enumerateInstances();
          e.hasMoreElements(); ) // for each instance
      {
        Instance inst = e.nextElement();
        if (!inst.isMissing(i)) // if attribute i is not missing (i.e. it's been bought)
        {
          int j = (int) inst.classValue();
          int k = (int) inst.value(i);
          alpha[i][j][k]++;
        }
      }
    }
    return ExpectedClassificationError(alpha, attr_i);
  }
Example #9
  /**
   * Returns a string representation of the classifier.
   *
   * @return a string representation of the classifier
   */
  public String toString() {
    StringBuffer result =
        new StringBuffer(
            "The independent probability of a class\n--------------------------------------\n");

    for (int c = 0; c < m_numClasses; c++)
      result
          .append(m_headerInfo.classAttribute().value(c))
          .append("\t")
          .append(Double.toString(m_probOfClass[c]))
          .append("\n");

    result.append(
        "\nThe probability of a word given the class\n-----------------------------------------\n\t");

    for (int c = 0; c < m_numClasses; c++)
      result.append(m_headerInfo.classAttribute().value(c)).append("\t");

    result.append("\n");

    for (int w = 0; w < m_numAttributes; w++) {
      if (w != m_headerInfo.classIndex()) {
        result.append(m_headerInfo.attribute(w).name()).append("\t");
        for (int c = 0; c < m_numClasses; c++)
          result.append(Double.toString(Math.exp(m_probOfWordGivenClass[c][w]))).append("\t");
        result.append("\n");
      }
    }

    return result.toString();
  }
Example #10
  /**
   * Returns the Capabilities of this filter, customized based on the data. I.e., it removes all
   * class capabilities in case there is no class attribute present, or removes the NO_CLASS
   * capability in case a class attribute is present.
   *
   * @param data the data to use for customization
   * @return the capabilities of this object, based on the data
   * @see #getCapabilities()
   */
  public Capabilities getCapabilities(Instances data) {
    Capabilities result;
    Capabilities classes;
    Iterator iter;
    Capability cap;

    result = getCapabilities();

    // no class? -> remove all class capabilities apart from NO_CLASS
    if (data.classIndex() == -1) {
      classes = result.getClassCapabilities();
      iter = classes.capabilities();
      while (iter.hasNext()) {
        cap = (Capability) iter.next();
        if (cap != Capability.NO_CLASS) {
          result.disable(cap);
          result.disableDependency(cap);
        }
      }
    }
    // class? -> remove NO_CLASS
    else {
      result.disable(Capability.NO_CLASS);
      result.disableDependency(Capability.NO_CLASS);
    }

    return result;
  }
Example #11
  /**
   * initializes the algorithm
   *
   * @param data the data to work with
   * @throws Exception if m_SVM is null
   */
  protected void init(Instances data) throws Exception {
    if (m_SVM == null) {
      throw new Exception("SVM not initialized in optimizer. Use RegOptimizer.setSVMReg()");
    }
    m_C = m_SVM.getC();
    m_data = data;
    m_classIndex = data.classIndex();
    m_nInstances = data.numInstances();

    // Initialize kernel
    m_kernel = Kernel.makeCopy(m_SVM.getKernel());
    m_kernel.buildKernel(data);

    // init m_target
    m_target = new double[m_nInstances];
    for (int i = 0; i < m_nInstances; i++) {
      m_target[i] = data.instance(i).classValue();
    }

    m_random = new Random(m_nSeed);

    // initialize the alpha and alpha* arrays to all zeros
    m_alpha = new double[m_target.length];
    m_alphaStar = new double[m_target.length];

    m_supportVectors = new SMOset(m_nInstances);

    m_b = 0.0;
    m_nEvals = 0;
    m_nCacheHits = -1;
  }
  /** test the batch saving/loading (via setFile(File)). */
  public void testBatch() {
    Instances data;

    try {
      // save
      m_Saver.setInstances(m_Instances);
      m_Saver.setFile(new File(m_ExportFilename));
      m_Saver.writeBatch();

      // load
      ((AbstractFileLoader) m_Loader).setFile(new File(m_ExportFilename));
      data = m_Loader.getDataSet();

      // compare data
      try {
        if (m_Instances.classIndex() != data.classIndex()) {
          data.setClassIndex(m_Instances.classIndex());
        }
        compareDatasets(m_Instances, data);
      } catch (Exception e) {
        fail("Incremental load failed (datasets differ): " + e.toString());
      }
    } catch (Exception e) {
      e.printStackTrace();
      fail("Batch save/load failed: " + e.toString());
    }
  }
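The same save/load round trip can be written against the concrete ARFF converter classes; a hedged sketch in which the file paths are assumptions.

import java.io.File;
import weka.core.Instances;
import weka.core.converters.ArffLoader;
import weka.core.converters.ArffSaver;
import weka.core.converters.ConverterUtils.DataSource;

public class ArffRoundTripExample {
  public static void main(String[] args) throws Exception {
    Instances data = new DataSource("iris.arff").getDataSet(); // assumed input file
    data.setClassIndex(data.numAttributes() - 1);

    // save
    ArffSaver saver = new ArffSaver();
    saver.setInstances(data);
    saver.setFile(new File("export.arff")); // assumed output path
    saver.writeBatch();

    // load it back and restore the class index, as the test above does
    ArffLoader loader = new ArffLoader();
    loader.setFile(new File("export.arff"));
    Instances reloaded = loader.getDataSet();
    reloaded.setClassIndex(data.classIndex());
    System.out.println("Reloaded " + reloaded.numInstances() + " instances");
  }
}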
  /** tests whether a URL can be loaded (via setURL(URL)). */
  public void testURLSourcedLoader() {
    Instances data;

    if (!(getLoader() instanceof URLSourcedLoader)) {
      return;
    }

    try {
      // save
      m_Saver.setInstances(m_Instances);
      m_Saver.setFile(new File(m_ExportFilename));
      m_Saver.writeBatch();

      // load
      ((URLSourcedLoader) m_Loader).setURL(new File(m_ExportFilename).toURI().toURL().toString());
      data = m_Loader.getDataSet();

      // compare data
      try {
        if (m_Instances.classIndex() != data.classIndex()) {
          data.setClassIndex(m_Instances.classIndex());
        }
        compareDatasets(m_Instances, data);
      } catch (Exception e) {
        fail("URL load failed (datasets differ): " + e.toString());
      }
    } catch (Exception e) {
      e.printStackTrace();
      fail("URL load failed: " + e.toString());
    }
  }
Example #14
  /**
   * Signify that this batch of input to the filter is finished. If the filter requires all
   * instances prior to filtering, output() may now be called to retrieve the filtered instances.
   *
   * @return true if there are instances pending output
   * @throws IllegalStateException if no input structure has been defined
   */
  public boolean batchFinished() throws Exception {

    if (getInputFormat() == null) {
      throw new IllegalStateException("No input instance format defined");
    }
    if (m_attStats == null) {
      Instances input = getInputFormat();

      m_attStats = new AttributeStats[input.numAttributes()];

      for (int i = 0; i < input.numAttributes(); i++) {
        if (input.attribute(i).isNumeric() && (input.classIndex() != i)) {
          m_attStats[i] = input.attributeStats(i);
        }
      }

      // Convert pending input instances
      for (int i = 0; i < input.numInstances(); i++) {
        convertInstance(input.instance(i));
      }
    }
    // Free memory
    flushInput();

    m_NewBatch = true;
    return (numPendingOutput() != 0);
  }
 /** trains the classifier */
 @Override
 public void train() throws Exception {
   if (_train.classIndex() == -1) _train.setClassIndex(_train.numAttributes() - 1);
   _cl.buildClassifier(_train);
   // evaluate classifier and print some statistics
   evaluate();
 }
  /** tests whether data can be loaded via setSource() with a file stream. */
  public void testLoaderWithStream() {
    Instances data;

    try {
      // save
      m_Saver.setInstances(m_Instances);
      m_Saver.setFile(new File(m_ExportFilename));
      m_Saver.writeBatch();

      // load
      m_Loader.setSource(new FileInputStream(new File(m_ExportFilename)));
      data = m_Loader.getDataSet();

      // compare data
      try {
        if (m_Instances.classIndex() != data.classIndex()) {
          data.setClassIndex(m_Instances.classIndex());
        }
        compareDatasets(m_Instances, data);
      } catch (Exception e) {
        fail("File stream loading failed (datasets differ): " + e.toString());
      }
    } catch (Exception e) {
      e.printStackTrace();
      fail("File stream loading failed: " + e.toString());
    }
  }
Example #17
File: PCC.java Project: nicoladimauro/meka
 /**
  * GetKs - return [K_1,K_2,...,K_L] where each Y_j \in {1,...,K_j}. In the multi-label case, K[j]
  * = 2 for all j = 1,...,L.
  *
  * @param D a dataset
  * @return an array of the number of values that each label can take
  */
 private static int[] getKs(Instances D) {
   int L = D.classIndex();
   int K[] = new int[L];
   for (int k = 0; k < L; k++) {
     K[k] = D.attribute(k).numValues();
   }
   return K;
 }
Example #18
  /**
   * Return the full data set. If the structure hasn't yet been determined by a call to getStructure,
   * then this method should do so before processing the rest of the data set.
   *
   * @return the full data set
   * @throws IOException if there is no source or parsing fails
   */
  public Instances getDataSet() throws IOException {
    if (getDirectory() == null) throw new IOException("No directory/source has been specified");

    String directoryPath = getDirectory().getAbsolutePath();
    ArrayList<String> classes = new ArrayList<String>();
    Enumeration enm = getStructure().classAttribute().enumerateValues();
    while (enm.hasMoreElements()) classes.add((String) enm.nextElement());

    Instances data = getStructure();
    int fileCount = 0;
    for (int k = 0; k < classes.size(); k++) {
      String subdirPath = (String) classes.get(k);
      File subdir = new File(directoryPath + File.separator + subdirPath);
      String[] files = subdir.list();
      for (int j = 0; j < files.length; j++) {
        try {
          fileCount++;
          if (getDebug())
            System.err.println("processing " + fileCount + " : " + subdirPath + " : " + files[j]);

          double[] newInst = null;
          if (m_OutputFilename) newInst = new double[3];
          else newInst = new double[2];
          File txt =
              new File(directoryPath + File.separator + subdirPath + File.separator + files[j]);
          BufferedReader is;
          if (m_charSet == null || m_charSet.length() == 0) {
            is = new BufferedReader(new InputStreamReader(new FileInputStream(txt)));
          } else {
            is = new BufferedReader(new InputStreamReader(new FileInputStream(txt), m_charSet));
          }
          StringBuffer txtStr = new StringBuffer();
          int c;
          while ((c = is.read()) != -1) {
            txtStr.append((char) c);
          }

          newInst[0] = (double) data.attribute(0).addStringValue(txtStr.toString());
          if (m_OutputFilename)
            newInst[1] =
                (double) data.attribute(1).addStringValue(subdirPath + File.separator + files[j]);
          newInst[data.classIndex()] = (double) k;
          data.add(new DenseInstance(1.0, newInst));
          is.close();
        } catch (Exception e) {
          System.err.println(
              "failed to convert file: "
                  + directoryPath
                  + File.separator
                  + subdirPath
                  + File.separator
                  + files[j]);
        }
      }
    }

    return data;
  }
 /** evaluates the classifier */
 @Override
 public void evaluate() throws Exception {
   // evaluate classifier and print some statistics
   if (_test.classIndex() == -1) _test.setClassIndex(_test.numAttributes() - 1);
   Evaluation eval = new Evaluation(_train);
   eval.evaluateModel(_cl, _test);
   System.out.println(eval.toSummaryString("\nResults\n======\n", false));
   System.out.println(eval.toMatrixString());
 }
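For comparison, a brief sketch (not from the original class) that replaces the fixed train/test split with 10-fold cross-validation via Evaluation.crossValidateModel; the classifier and file name are assumptions.

import java.util.Random;
import weka.classifiers.Evaluation;
import weka.classifiers.trees.J48;
import weka.core.Instances;
import weka.core.converters.ConverterUtils.DataSource;

public class CrossValidationExample {
  public static void main(String[] args) throws Exception {
    Instances data = new DataSource("labor.arff").getDataSet(); // assumed file name
    if (data.classIndex() == -1) {
      data.setClassIndex(data.numAttributes() - 1);
    }
    Evaluation eval = new Evaluation(data);
    eval.crossValidateModel(new J48(), data, 10, new Random(1));
    System.out.println(eval.toSummaryString("\nResults\n======\n", false));
    System.out.println(eval.toMatrixString());
  }
}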
 public static void main(String[] args) throws Exception {
   BufferedReader reader = new BufferedReader(new FileReader("PCAin.arff"));
   Instances data = new Instances(reader);
   reader.close();
   if (data.classIndex() == -1) {
     data.setClassIndex(data.numAttributes() - 1);
   }
   pca(data);
 }
Example #21
  /**
   * Searches the attribute subset space using a genetic algorithm.
   *
   * @param ASEval the attribute evaluator to guide the search
   * @param data the training instances.
   * @return an array (not necessarily ordered) of selected attribute indexes
   * @throws Exception if the search can't be completed
   */
  @Override
  public int[] search(ASEvaluation ASEval, Instances data) throws Exception {

    m_best = null;
    m_generationReports = new StringBuffer();

    if (!(ASEval instanceof SubsetEvaluator)) {
      throw new Exception(ASEval.getClass().getName() + " is not a " + "Subset evaluator!");
    }

    if (ASEval instanceof UnsupervisedSubsetEvaluator) {
      m_hasClass = false;
    } else {
      m_hasClass = true;
      m_classIndex = data.classIndex();
    }

    SubsetEvaluator ASEvaluator = (SubsetEvaluator) ASEval;
    m_numAttribs = data.numAttributes();

    m_startRange.setUpper(m_numAttribs - 1);
    if (!(getStartSet().equals(""))) {
      m_starting = m_startRange.getSelection();
    }

    // initial random population
    m_lookupTable = new Hashtable<BitSet, GABitSet>(m_lookupTableSize);
    m_random = new Random(m_seed);
    m_population = new GABitSet[m_popSize];

    // set up random initial population
    initPopulation();
    evaluatePopulation(ASEvaluator);
    populationStatistics();
    scalePopulation();
    checkBest();
    m_generationReports.append(populationReport(0));

    boolean converged;
    for (int i = 1; i <= m_maxGenerations; i++) {
      generation();
      evaluatePopulation(ASEvaluator);
      populationStatistics();
      scalePopulation();
      // find the best pop member and check for convergence
      converged = checkBest();

      if ((i == m_maxGenerations) || ((i % m_reportFrequency) == 0) || (converged == true)) {
        m_generationReports.append(populationReport(i));
        if (converged == true) {
          break;
        }
      }
    }
    return attributeList(m_best.getChromosome());
  }
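A hedged sketch of driving a genetic search through the AttributeSelection meta-class, pairing it with CfsSubsetEval (a SubsetEvaluator, as the method above requires). GeneticSearch ships with older WEKA releases and as an optional package in newer ones; the file name is an assumption.

import weka.attributeSelection.AttributeSelection;
import weka.attributeSelection.CfsSubsetEval;
import weka.attributeSelection.GeneticSearch;
import weka.core.Instances;
import weka.core.converters.ConverterUtils.DataSource;

public class GeneticSearchExample {
  public static void main(String[] args) throws Exception {
    Instances data = new DataSource("vote.arff").getDataSet(); // assumed file name
    data.setClassIndex(data.numAttributes() - 1);

    AttributeSelection selector = new AttributeSelection();
    selector.setEvaluator(new CfsSubsetEval()); // a SubsetEvaluator
    selector.setSearch(new GeneticSearch());    // calls search(ASEvaluation, Instances)
    selector.SelectAttributes(data);
    System.out.println(selector.toResultsString());
  }
}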
  /**
   * Method for building this classifier. Since the collective classifiers also need the test set,
   * we only store the training set here.
   *
   * @param training the training set to use
   * @throws Exception derived classes may throw Exceptions
   */
  public void buildClassifier(Instances training) throws Exception {
    m_ClassifierBuilt = false;
    m_Trainset = training;

    // set class index?
    if (m_Trainset.classIndex() == -1) m_Trainset.setClassIndex(m_Trainset.numAttributes() - 1);

    // necessary for JUnit tests
    checkRestrictions();
  }
Example #23
  /**
   * Determines the output format based on the input format and returns this. In case the output
   * format cannot be returned immediately, i.e., hasImmediateOutputFormat() returns false, then this
   * method will be called from batchFinished().
   *
   * @param inputFormat the input format to base the output format on
   * @return the output format
   * @throws Exception in case the determination goes wrong
   * @see #hasImmediateOutputFormat()
   * @see #batchFinished()
   */
  protected Instances determineOutputFormat(Instances inputFormat) throws Exception {

    Instances data;
    Instances result;
    FastVector atts;
    FastVector values;
    HashSet hash;
    int i;
    int n;
    boolean isDate;
    Instance inst;
    Vector sorted;

    m_Cols.setUpper(inputFormat.numAttributes() - 1);
    data = new Instances(inputFormat);
    atts = new FastVector();
    for (i = 0; i < data.numAttributes(); i++) {
      if (!m_Cols.isInRange(i) || !data.attribute(i).isNumeric()) {
        atts.addElement(data.attribute(i));
        continue;
      }

      // date attribute?
      isDate = (data.attribute(i).type() == Attribute.DATE);

      // collect all distinct values of this attribute in the dataset
      hash = new HashSet();
      for (n = 0; n < data.numInstances(); n++) {
        inst = data.instance(n);
        if (inst.isMissing(i)) continue;

        if (isDate) hash.add(inst.stringValue(i));
        else hash.add(new Double(inst.value(i)));
      }

      // sort values
      sorted = new Vector();
      for (Object o : hash) sorted.add(o);
      Collections.sort(sorted);

      // create attribute from sorted values
      values = new FastVector();
      for (Object o : sorted) {
        if (isDate) values.addElement(o.toString());
        else values.addElement(Utils.doubleToString(((Double) o).doubleValue(), MAX_DECIMALS));
      }
      atts.addElement(new Attribute(data.attribute(i).name(), values));
    }

    result = new Instances(inputFormat.relationName(), atts, 0);
    result.setClassIndex(inputFormat.classIndex());

    return result;
  }
Example #24
  private static void evaluateClassifier(Classifier c, Instances trainData, Instances testData)
      throws Exception {
    System.err.println(
        "INFO: Starting split validation to predict '"
            + trainData.classAttribute().name()
            + "' using '"
            + c.getClass().getCanonicalName()
            + ":"
            + Arrays.toString(c.getOptions())
            + "' (#train="
            + trainData.numInstances()
            + ",#test="
            + testData.numInstances()
            + ") ...");

    if (trainData.classIndex() < 0) throw new IllegalStateException("class attribute not set");

    c.buildClassifier(trainData);
    Evaluation eval = new Evaluation(testData);
    eval.useNoPriors();
    double[] predictions = eval.evaluateModel(c, testData);

    System.out.println(eval.toClassDetailsString());
    System.out.println(eval.toSummaryString("\nResults\n======\n", false));

    // write predictions to file
    {
      System.err.println("INFO: Writing predictions to file ...");
      Writer out = new FileWriter("prediction.trec");
      writePredictionsTrecEval(predictions, testData, 0, trainData.classIndex(), out);
      out.close();
    }

    // write predicted distributions to CSV
    {
      System.err.println("INFO: Writing predicted distributions to CSV ...");
      Writer out = new FileWriter("predicted_distribution.csv");
      writePredictedDistributions(c, testData, 0, out);
      out.close();
    }
  }
Example #25
  /**
   * Calculates the distance between two instances
   *
   * @param first the first instance
   * @param second the second instance
   * @return the distance between the two given instances, between 0 and 1
   */
  protected double distance(Instance first, Instance second) {

    double distance = 0;
    int firstI, secondI;

    for (int p1 = 0, p2 = 0; p1 < first.numValues() || p2 < second.numValues(); ) {
      if (p1 >= first.numValues()) {
        firstI = m_instances.numAttributes();
      } else {
        firstI = first.index(p1);
      }
      if (p2 >= second.numValues()) {
        secondI = m_instances.numAttributes();
      } else {
        secondI = second.index(p2);
      }
      if (firstI == m_instances.classIndex()) {
        p1++;
        continue;
      }
      if (secondI == m_instances.classIndex()) {
        p2++;
        continue;
      }
      double diff;
      if (firstI == secondI) {
        diff = difference(firstI, first.valueSparse(p1), second.valueSparse(p2));
        p1++;
        p2++;
      } else if (firstI > secondI) {
        diff = difference(secondI, 0, second.valueSparse(p2));
        p2++;
      } else {
        diff = difference(firstI, first.valueSparse(p1), 0);
        p1++;
      }
      distance += diff * diff;
    }

    return Math.sqrt(distance / m_instances.numAttributes());
  }
Example #26
File: Wavelet.java Project: dachylong/weka
  /**
   * processes the instances using the HAAR algorithm
   *
   * @param instances the data to process
   * @return the modified data
   * @throws Exception in case the processing goes wrong
   */
  protected Instances processHAAR(Instances instances) throws Exception {
    Instances result;
    int i;
    int n;
    int j;
    int clsIdx;
    double[] oldVal;
    double[] newVal;
    int level;
    int length;
    double[] clsVal;
    Attribute clsAtt;

    clsIdx = instances.classIndex();
    clsVal = null;
    clsAtt = null;
    if (clsIdx > -1) {
      clsVal = instances.attributeToDoubleArray(clsIdx);
      clsAtt = (Attribute) instances.classAttribute().copy();
      instances.setClassIndex(-1);
      instances.deleteAttributeAt(clsIdx);
    }
    result = new Instances(instances, 0);
    level = (int) StrictMath.ceil(StrictMath.log(instances.numAttributes()) / StrictMath.log(2.0));

    for (i = 0; i < instances.numInstances(); i++) {
      oldVal = instances.instance(i).toDoubleArray();
      newVal = new double[oldVal.length];

      for (n = level; n > 0; n--) {
        length = (int) StrictMath.pow(2, n - 1);

        for (j = 0; j < length; j++) {
          newVal[j] = (oldVal[j * 2] + oldVal[j * 2 + 1]) / StrictMath.sqrt(2);
          newVal[j + length] = (oldVal[j * 2] - oldVal[j * 2 + 1]) / StrictMath.sqrt(2);
        }

        System.arraycopy(newVal, 0, oldVal, 0, newVal.length);
      }

      // add new transformed instance
      result.add(new DenseInstance(1, newVal));
    }

    // add class again
    if (clsIdx > -1) {
      result.insertAttributeAt(clsAtt, clsIdx);
      result.setClassIndex(clsIdx);
      for (i = 0; i < clsVal.length; i++) result.instance(i).setClassValue(clsVal[i]);
    }

    return result;
  }
Example #27
  // use the learned classifiers to get conditional probability
  protected double conMI(Instances D_j, Instances D_k, CNode[][] miNodes, int j, int k)
      throws Exception {

    int L = D_j.classIndex();
    int N = D_j.numInstances();
    double y[] = new double[L];
    double I = 0.0; // conditional mutual information for y_j and y_k
    double p_1, p_2; // p( y_j = 1 | x ), p( y_j = 2 | x )
    double p_12[] = {
      0.0, 0.0
    }; // p_12[0] = p( y_j = 1 | y_k = 0, x ) and p_12[1] = p( y_j = 1 | y_k = 1, x )

    for (int i = 0; i < N; i++) {
      Arrays.fill(y, 0);
      p_1 =
          Math.max(
              miNodes[j][0].distribution((Instance) D_j.instance(i).copy(), y)[1],
              0.000001); // p( y_j = 1 | x )
      p_1 = Math.min(p_1, 0.999999);
      p_1 = Math.max(p_1, 0.000001);
      Arrays.fill(y, 0);
      p_2 =
          Math.max(
              miNodes[k][0].distribution((Instance) D_k.instance(i).copy(), y)[1],
              0.000001); // p( y_k = 1 | x )
      p_2 = Math.min(p_2, 0.999999);
      p_2 = Math.max(p_2, 0.000001);
      Arrays.fill(y, 0);
      p_12[0] =
          Math.max(
              miNodes[j][k - j].distribution((Instance) D_j.instance(i).copy(), y)[1],
              0.000001); // p( y_j = 1 | y_k = 0, x )
      p_12[0] = Math.min(p_12[0], 0.999999);
      p_12[0] = Math.max(p_12[0], 0.000001);
      Arrays.fill(y, 0);
      Arrays.fill(y, k, k + 1, 1.0);
      p_12[1] =
          Math.max(
              miNodes[j][k - j].distribution((Instance) D_j.instance(i).copy(), y)[1],
              0.000001); // p( y_j = 1 | y_k = 1, x )
      p_12[1] = Math.min(p_12[1], 0.999999);
      p_12[1] = Math.max(p_12[1], 0.000001);

      I +=
          (1 - p_12[0]) * (1 - p_2) * Math.log((1 - p_12[0]) / (1 - p_1)); // I( y_j = 0 ; y_k = 0 )
      I += (1 - p_12[1]) * (p_2) * Math.log((1 - p_12[1]) / (1 - p_1)); // I( y_j = 0 ; y_k = 1 )
      I += (p_12[0]) * (1 - p_2) * Math.log((p_12[0]) / (p_1)); // I( y_j = 1 ; y_k = 0 )
      I += (p_12[1]) * (p_2) * Math.log((p_12[1]) / (p_1)); // I( y_j = 1 ; y_k = 1 )
    }
    I = I / N;
    return I;
  }
Example #28
  @Override
  public void train(Instances instance) {

    // split the training instances by class value (assumes a binary 0/1 class)
    for (int i = 0; i < instance.numInstances(); i++) {
      if ((int) instance.instance(i).classValue() == 0) {
        zeroIns.add(instance.instance(i));
      } else {
        oneIns.add(instance.instance(i));
      }
    }
  }
Example #29
File: CNode.java Project: Waikato/meka
 /**
  * Transform.
  *
  * @param D original Instances
  * @param c to be the class Attribute
  * @param pa_c the parent indices of c
  * @return new Instances T
  */
 public static Instances transform(Instances D, int c, int pa_c[]) throws Exception {
   int L = D.classIndex();
   int keep[] = A.append(pa_c, c); // keep all parents and self!
   Arrays.sort(keep);
   int remv[] = A.invert(keep, L); // i.e., remove the rest < L
   Arrays.sort(remv);
   Instances T = F.remove(new Instances(D), remv, false);
   int map[] = new int[L];
   for (int j = 0; j < L; j++) {
     map[j] = Arrays.binarySearch(keep, j);
   }
   T.setClassIndex(map[c]);
   return T;
 }
Example #30
  /**
   * Sets the format of the input instances.
   *
   * @param instanceInfo an Instances object containing the input instance structure (any instances
   *     contained in the object are ignored - only the structure is required).
   * @return true if the outputFormat may be collected immediately
   * @throws Exception if the input format can't be set successfully
   */
  public boolean setInputFormat(Instances instanceInfo) throws Exception {

    super.setInputFormat(instanceInfo);
    m_AttIndex.setUpper(instanceInfo.numAttributes() - 1);
    m_FirstIndex.setUpper(instanceInfo.attribute(m_AttIndex.getIndex()).numValues() - 1);
    m_SecondIndex.setUpper(instanceInfo.attribute(m_AttIndex.getIndex()).numValues() - 1);
    if ((instanceInfo.classIndex() > -1) && (instanceInfo.classIndex() == m_AttIndex.getIndex())) {
      throw new Exception("Cannot process class attribute.");
    }
    if (!instanceInfo.attribute(m_AttIndex.getIndex()).isNominal()) {
      throw new UnsupportedAttributeTypeException("Chosen attribute not nominal.");
    }
    if (instanceInfo.attribute(m_AttIndex.getIndex()).numValues() < 2) {
      throw new UnsupportedAttributeTypeException(
          "Chosen attribute has less than " + "two values.");
    }
    if (m_SecondIndex.getIndex() <= m_FirstIndex.getIndex()) {
      // XXX Maybe we should just swap the values??
      throw new Exception("The second index has to be greater " + "than the first.");
    }
    setOutputFormat();
    return true;
  }
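The field names (m_AttIndex, m_FirstIndex, m_SecondIndex) and the checks above match WEKA's MergeTwoValues filter; assuming that filter, a brief usage sketch in which the file name and indices are assumptions.

import weka.core.Instances;
import weka.core.converters.ConverterUtils.DataSource;
import weka.filters.Filter;
import weka.filters.unsupervised.attribute.MergeTwoValues;

public class MergeTwoValuesExample {
  public static void main(String[] args) throws Exception {
    Instances data = new DataSource("contact-lenses.arff").getDataSet(); // assumed file name
    data.setClassIndex(data.numAttributes() - 1);

    MergeTwoValues merge = new MergeTwoValues();
    merge.setAttributeIndex("1");   // 1-based index of a nominal, non-class attribute
    merge.setFirstValueIndex("1");  // values to merge (second must be greater than first)
    merge.setSecondValueIndex("2");
    merge.setInputFormat(data);     // runs the checks shown above
    Instances merged = Filter.useFilter(data, merge);
    System.out.println(merged.attribute(0));
  }
}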