public Instances transformInstances(MultiLabelInstances mlData) throws Exception {
    labelIndices = mlData.getLabelIndices();
    numOfLabels = mlData.getNumLabels();
    Instances data = mlData.getDataSet();

    Instances transformed = new Instances(mlData.getDataSet(), 0);

    // delete all labels
    transformed = RemoveAllLabels.transformInstances(transformed, labelIndices);

    // add single label attribute
    ArrayList<String> classValues = new ArrayList<String>(numOfLabels);
    for (int x = 0; x < numOfLabels; x++) {
      classValues.add("Class" + (x + 1));
    }
    Attribute newClass = new Attribute("Class", classValues);
    transformed.insertAttributeAt(newClass, transformed.numAttributes());
    transformed.setClassIndex(transformed.numAttributes() - 1);

    for (int instanceIndex = 0; instanceIndex < data.numInstances(); instanceIndex++) {
      // System.out.println(data.instance(instanceIndex).toString());
      List<Instance> result = transformInstance(data.instance(instanceIndex));
      for (Instance instance : result) {
        // System.out.println(instance.toString());
        transformed.add(instance);
        // System.out.println(transformed.instance(transformed.numInstances()-1));
      }
    }
    return transformed;
  }
  @Override
  protected Instances process(Instances instances) throws Exception {
    instances.insertAttributeAt(getIndicatorAttribute(), instances.numAttributes() - 1);

    if (m_FirstBatchDone == false) {
      searchMedian(instances);
    }
    imputeMedian(instances);

    return instances;
  }
Exemple #3
0
  /**
   * processes the instances using the HAAR algorithm
   *
   * @param instances the data to process
   * @return the modified data
   * @throws Exception in case the processing goes wrong
   */
  protected Instances processHAAR(Instances instances) throws Exception {
    Instances result;
    int i;
    int n;
    int j;
    int clsIdx;
    double[] oldVal;
    double[] newVal;
    int level;
    int length;
    double[] clsVal;
    Attribute clsAtt;

    clsIdx = instances.classIndex();
    clsVal = null;
    clsAtt = null;
    if (clsIdx > -1) {
      clsVal = instances.attributeToDoubleArray(clsIdx);
      clsAtt = (Attribute) instances.classAttribute().copy();
      instances.setClassIndex(-1);
      instances.deleteAttributeAt(clsIdx);
    }
    result = new Instances(instances, 0);
    level = (int) StrictMath.ceil(StrictMath.log(instances.numAttributes()) / StrictMath.log(2.0));

    for (i = 0; i < instances.numInstances(); i++) {
      oldVal = instances.instance(i).toDoubleArray();
      newVal = new double[oldVal.length];

      for (n = level; n > 0; n--) {
        length = (int) StrictMath.pow(2, n - 1);

        for (j = 0; j < length; j++) {
          newVal[j] = (oldVal[j * 2] + oldVal[j * 2 + 1]) / StrictMath.sqrt(2);
          newVal[j + length] = (oldVal[j * 2] - oldVal[j * 2 + 1]) / StrictMath.sqrt(2);
        }

        System.arraycopy(newVal, 0, oldVal, 0, newVal.length);
      }

      // add new transformed instance
      result.add(new DenseInstance(1, newVal));
    }

    // add class again
    if (clsIdx > -1) {
      result.insertAttributeAt(clsAtt, clsIdx);
      result.setClassIndex(clsIdx);
      for (i = 0; i < clsVal.length; i++) result.instance(i).setClassValue(clsVal[i]);
    }

    return result;
  }
Exemple #4
0
  /**
   * Sets the format of the input instances.
   *
   * @param instanceInfo an Instances object containing the input instance structure (any instances
   *     contained in the object are ignored - only the structure is required).
   * @return true if the outputFormat may be collected immediately
   * @exception Exception if the format couldn't be set successfully
   */
  public boolean setInputFormat(Instances instanceInfo) throws Exception {

    convertInfixToPostfix(new String(m_infixExpression));

    super.setInputFormat(instanceInfo);

    Instances outputFormat = new Instances(instanceInfo, 0);
    Attribute newAttribute;
    if (m_Debug) {
      newAttribute = new Attribute(m_postFixExpVector.toString());
    } else if (m_attributeName.compareTo("expression") != 0) {
      newAttribute = new Attribute(m_attributeName);
    } else {
      newAttribute = new Attribute(m_infixExpression);
    }
    outputFormat.insertAttributeAt(newAttribute, instanceInfo.numAttributes());
    setOutputFormat(outputFormat);
    return true;
  }
Exemple #5
0
  /**
   * Sets the format of the input instances.
   *
   * @param instanceInfo an Instances object containing the input instance structure (any instances
   *     contained in the object are ignored - only the structure is required).
   * @return true if the outputFormat may be collected immediately
   * @throws Exception if the format couldn't be set successfully
   */
  public boolean setInputFormat(Instances instanceInfo) throws Exception {

    super.setInputFormat(instanceInfo);

    m_Insert.setUpper(instanceInfo.numAttributes());
    Instances outputFormat = new Instances(instanceInfo, 0);
    Attribute newAttribute = null;
    switch (m_AttributeType) {
      case Attribute.NUMERIC:
        newAttribute = new Attribute(m_Name);
        break;
      case Attribute.NOMINAL:
        newAttribute = new Attribute(m_Name, m_Labels);
        break;
      case Attribute.STRING:
        newAttribute = new Attribute(m_Name, (FastVector) null);
        break;
      case Attribute.DATE:
        newAttribute = new Attribute(m_Name, m_DateFormat);
        break;
      default:
        throw new IllegalArgumentException("Unknown attribute type in Add");
    }

    if ((m_Insert.getIndex() < 0) || (m_Insert.getIndex() > getInputFormat().numAttributes())) {
      throw new IllegalArgumentException("Index out of range");
    }
    outputFormat.insertAttributeAt(newAttribute, m_Insert.getIndex());
    setOutputFormat(outputFormat);

    // all attributes, except index of added attribute
    // (otherwise the length of the input/output indices differ)
    Range atts = new Range(m_Insert.getSingleIndex());
    atts.setInvert(true);
    atts.setUpper(outputFormat.numAttributes() - 1);
    initOutputLocators(outputFormat, atts.getSelection());

    return true;
  }
Exemple #6
0
  /**
   * pads the data to conform to the necessary number of attributes
   *
   * @param data the data to pad
   * @return the padded data
   */
  protected Instances pad(Instances data) {
    Instances result;
    int i;
    int n;
    String prefix;
    int numAtts;
    boolean isLast;
    int index;
    Vector<Integer> padded;
    int[] indices;
    FastVector atts;

    // determine number of padding attributes
    switch (m_Padding) {
      case PADDING_ZERO:
        if (data.classIndex() > -1)
          numAtts = (nextPowerOf2(data.numAttributes() - 1) + 1) - data.numAttributes();
        else numAtts = nextPowerOf2(data.numAttributes()) - data.numAttributes();
        break;

      default:
        throw new IllegalStateException(
            "Padding " + new SelectedTag(m_Algorithm, TAGS_PADDING) + " not implemented!");
    }

    result = new Instances(data);
    prefix = getAlgorithm().getSelectedTag().getReadable();

    // any padding necessary?
    if (numAtts > 0) {
      // add padding attributes
      isLast = (data.classIndex() == data.numAttributes() - 1);
      padded = new Vector<Integer>();
      for (i = 0; i < numAtts; i++) {
        if (isLast) index = result.numAttributes() - 1;
        else index = result.numAttributes();

        result.insertAttributeAt(new Attribute(prefix + "_padding_" + (i + 1)), index);

        // record index
        padded.add(new Integer(index));
      }

      // get padded indices
      indices = new int[padded.size()];
      for (i = 0; i < padded.size(); i++) indices[i] = padded.get(i);

      // determine number of padding attributes
      switch (m_Padding) {
        case PADDING_ZERO:
          for (i = 0; i < result.numInstances(); i++) {
            for (n = 0; n < indices.length; n++) result.instance(i).setValue(indices[n], 0);
          }
          break;
      }
    }

    // rename all attributes apart from class
    data = result;
    atts = new FastVector();
    n = 0;
    for (i = 0; i < data.numAttributes(); i++) {
      n++;
      if (i == data.classIndex()) atts.addElement((Attribute) data.attribute(i).copy());
      else atts.addElement(new Attribute(prefix + "_" + n));
    }

    // create new dataset
    result = new Instances(data.relationName(), atts, data.numInstances());
    result.setClassIndex(data.classIndex());
    for (i = 0; i < data.numInstances(); i++)
      result.add(new DenseInstance(1.0, data.instance(i).toDoubleArray()));

    return result;
  }
  public void generateDataSet() {

    // Read all the instances in the file (ARFF, CSV, XRFF, ...)
    try {
      source = new DataSource("data\\bne.csv");
    } catch (Exception e) {
      // TODO Auto-generated catch block
      e.printStackTrace();
    }

    // Create data set
    try {
      instances = source.getDataSet();
    } catch (Exception e) {
      // TODO Auto-generated catch block
      e.printStackTrace();
    }

    // Reverse the order of instances in the data set to place them in
    // chronological order
    for (int i = 0; i < (instances.numInstances() / 2); i++) {
      instances.swap(i, instances.numInstances() - 1 - i);
    }

    // Remove "volume", "low price", "high price", "opening price" and
    // "data" from data set
    instances.deleteAttributeAt(instances.numAttributes() - 1);
    instances.deleteAttributeAt(instances.numAttributes() - 2);
    instances.deleteAttributeAt(instances.numAttributes() - 2);
    instances.deleteAttributeAt(instances.numAttributes() - 2);
    instances.deleteAttributeAt(instances.numAttributes() - 2);

    // Create list to hold nominal values "purchase", "sale", "retain"
    List my_nominal_values = new ArrayList(3);
    my_nominal_values.add("purchase");
    my_nominal_values.add("sale");
    my_nominal_values.add("retain");

    // Create nominal attribute "classIndex"
    Attribute classIndex = new Attribute("classIndex", my_nominal_values);

    // Add "classIndex" as an attribute to each instance
    instances.insertAttributeAt(classIndex, instances.numAttributes());

    // Set the value of "classIndex" for each instance
    for (int i = 0; i < instances.numInstances() - 1; i++) {
      if (instances.get(i + 1).value(instances.numAttributes() - 2)
          > instances.get(i).value(instances.numAttributes() - 2)) {
        instances.get(i).setValue(instances.numAttributes() - 1, "purchase");
      } else if (instances.get(i + 1).value(instances.numAttributes() - 2)
          < instances.get(i).value(instances.numAttributes() - 2)) {
        instances.get(i).setValue(instances.numAttributes() - 1, "sale");
      } else if (instances.get(i + 1).value(instances.numAttributes() - 2)
          == instances.get(i).value(instances.numAttributes() - 2)) {
        instances.get(i).setValue(instances.numAttributes() - 1, "retain");
      }
    }

    // Make the last attribute be the class
    instances.setClassIndex(instances.numAttributes() - 1);

    // Calculate and insert technical analysis attributes into data set
    Strategies strategies = new Strategies();
    strategies.applyStrategies();

    // Print header and instances
    System.out.println("\nDataset:\n");
    System.out.println(instances);
    System.out.println(instances.numInstances());
  }
  /**
   * ************************************************** Convert a table to a set of instances, with
   * <b>columns</b> representing individual </b>instances</b> and <b>rows</b> representing
   * <b>attributes</b> (e.g. as is common with microarray data)
   */
  public Instances tableColsToInstances(Table t, String relationName) {

    System.err.print("Converting table cols to instances...");

    // Set up attributes, which for colInstances will be the rowNames...
    FastVector atts = new FastVector();
    ArrayList<Boolean> isNominal = new ArrayList<Boolean>();
    ArrayList<FastVector> allAttVals = new ArrayList<FastVector>(); // Save values for later...

    System.err.print("creating attributes...");

    for (int r = 0; r < t.numRows; r++) {
      if (rowIsNumeric(t, r)) {
        isNominal.add(false);
        atts.addElement(new Attribute(t.rowNames[r]));
        allAttVals.add(null); // No enumeration of attribute values.
      } else {
        // It's nominal... determine the range of values and create a nominal attribute...
        isNominal.add(true);
        FastVector attVals = getRowValues(t, r);
        atts.addElement(new Attribute(t.rowNames[r], attVals));
        // Save it for later
        allAttVals.add(attVals);
      }
    }

    System.err.print("creating instances...");

    // Create Instances object..
    Instances data = new Instances(relationName, atts, 0);
    data.setRelationName(relationName);

    /** ***** CREATE INSTANCES ************* */
    // Fill the instances with data...
    // For each instance...
    for (int c = 0; c < t.numCols; c++) {
      double[] vals =
          new double[data.numAttributes()]; // Even nominal values are stored as double pointers.

      // For each attribute fill in the numeric or attributeValue index...
      for (int r = 0; r < t.numRows; r++) {
        String val = (String) t.matrix.getQuick(r, c);
        if (val == "?") vals[r] = Instance.missingValue();
        else if (isNominal.get(r)) {
          vals[r] = allAttVals.get(r).indexOf(val);
        } else {
          vals[r] = Double.parseDouble((String) val);
        }
      }
      // Add the a newly minted instance with those attribute values...
      data.add(new Instance(1.0, vals));
    }

    System.err.print("add feature names...");

    /** ***** ADD FEATURE NAMES ************* */
    // takes basically zero time... all time is in previous 2 chunks.
    if (addInstanceNamesAsFeatures) {
      Instances newData = new Instances(data);
      newData.insertAttributeAt(new Attribute("ID", (FastVector) null), 0);
      int attrIdx = newData.attribute("ID").index(); // Paranoid... should be 0

      // We save the instanceNames in a list because it's handy later on...
      instanceNames = new ArrayList<String>();

      for (int c = 0; c < t.colNames.length; c++) {
        instanceNames.add(t.colNames[c]);
        newData.instance(c).setValue(attrIdx, t.colNames[c]);
      }
      data = newData;
    }

    System.err.println("done.");

    return (data);
  }
  /**
   * ************************************************** Convert a table to a set of instances, with
   * <b>rows</b> representing individual </b>instances</b> and <b>columns</b> representing
   * <b>attributes</b>
   */
  public Instances tableRowsToNominalInstances(Table t, String relationName) {

    System.err.print("Converting table rows to instances...");

    // Set up attributes, which for rowInstances will be the colNames...
    FastVector atts = new FastVector();
    ArrayList<Boolean> isNominal = new ArrayList<Boolean>();
    ArrayList<FastVector> allAttVals = new ArrayList<FastVector>(); // Save values for later...			

    System.err.print("creating attributes...");

    for (int c = 0; c < t.numCols; c++) {
      // It's nominal... determine the range of values
      isNominal.add(true);
      FastVector attVals = getColValues(t, c);
      atts.addElement(new Attribute(t.colNames[c], attVals));
      // Save it for later
      allAttVals.add(attVals);
    }

    System.err.print("creating instances...");

    // Create Instances object..
    Instances data = new Instances(relationName, atts, 0);
    data.setRelationName(relationName);

    // Fill the instances with data...
    // For each instance...
    for (int r = 0; r < t.numRows; r++) {
      double[] vals = new double[data.numAttributes()];

      // for each attribute
      for (int c = 0; c < t.numCols; c++) {
        String val = (String) t.matrix.getQuick(r, c);
        if (val == "?") vals[c] = Instance.missingValue();
        else if (isNominal.get(c)) {
          vals[c] = allAttVals.get(c).indexOf(val);
        } else {
          vals[c] = Double.parseDouble((String) val);
        }
      }
      // Add the a newly minted instance with those attribute values...
      data.add(new Instance(1.0, vals));
    }

    System.err.print("add feature names...");

    if (addInstanceNamesAsFeatures) {
      Instances newData = new Instances(data);
      newData.insertAttributeAt(new Attribute("ID", (FastVector) null), 0);
      int attrIdx = newData.attribute("ID").index(); // Paranoid... should be 0

      // We save the instanceNames in a list because it's handy later on...
      instanceNames = new ArrayList<String>();

      for (int r = 0; r < t.rowNames.length; r++) {
        instanceNames.add(t.rowNames[r]);
        newData.instance(r).setValue(attrIdx, t.rowNames[r]);
      }
      data = newData;
    }

    System.err.println("done.");

    return (data);
  }
  /**
   * If we know in advance that the table is numeric, can optimize a lot... For example, on 9803 x
   * 294 table, TableFileLoader.readNumeric takes 6s compared to 12s for WekaMine readFromTable.
   */
  public static Instances readNumeric(String fileName, String relationName, String delimiter)
      throws Exception {

    int numAttributes = FileUtils.fastCountLines(fileName) - 1; // -1 exclude heading.
    String[] attrNames = new String[numAttributes];

    // Read the col headings and figure out the number of columns in the table..
    BufferedReader reader = new BufferedReader(new FileReader(fileName), 4194304);
    String line = reader.readLine();
    String[] instanceNames = parseColNames(line, delimiter);
    int numInstances = instanceNames.length;

    System.err.print("reading " + numAttributes + " x " + numInstances + " table..");

    // Create an array to hold the data as we read it in...
    double dataArray[][] = new double[numAttributes][numInstances];

    // Populate the matrix with values...
    String valToken = "";
    try {
      int rowIdx = 0;
      while ((line = reader.readLine()) != null) {

        String[] tokens = line.split(delimiter, -1);
        attrNames[rowIdx] = tokens[0].trim();
        for (int colIdx = 0; colIdx < (tokens.length - 1); colIdx++) {
          valToken = tokens[colIdx + 1];
          double value;

          if (valToken.equals("null")) {
            value = Instance.missingValue();
          } else if (valToken.equals("?")) {
            value = Instance.missingValue();
          } else if (valToken.equals("NA")) {
            value = Instance.missingValue();
          } else if (valToken.equals("")) {
            value = Instance.missingValue();
            // }else value = DoubleParser.lightningParse(valToken); // faster double parser with
            // MANY assumptions
          } else value = Double.parseDouble(valToken);
          dataArray[rowIdx][colIdx] = value;
        }
        rowIdx++;
      }
    } catch (NumberFormatException e) {
      System.err.println(e.toString());
      System.err.println("Parsing line: " + line);
      System.err.println("Parsing token: " + valToken);
    }

    // Set up attributes, which for colInstances will be the rowNames...
    FastVector atts = new FastVector();
    for (int a = 0; a < numAttributes; a++) {
      atts.addElement(new Attribute(attrNames[a]));
    }

    // Create Instances object..
    Instances data = new Instances(relationName, atts, 0);
    data.setRelationName(relationName);

    System.err.print("creating instances..");

    // System.err.println("DEBUG: numAttributes "+numAttributes);

    /** ***** CREATE INSTANCES ************* */
    // Fill the instances with data...
    // For each instance...
    for (int c = 0; c < numInstances; c++) {
      double[] vals =
          new double[data.numAttributes()]; // Even nominal values are stored as double pointers.

      for (int r = 0; r < numAttributes; r++) {
        double val = dataArray[r][c];
        vals[r] = val;
      }
      // Add the a newly minted instance with those attribute values...
      data.add(new Instance(1.0, vals));
    }

    // System.err.println("DEBUG: data.numInstances: "+data.numInstances());
    // System.err.println("DEBUG: data.numAttributes: "+data.numAttributes());
    // System.err.println("DEBUG: data.relationNAme"+data.relationName());
    System.err.print("add feature names..");

    /** ***** ADD FEATURE NAMES ************* */
    // takes basically zero time... all time is in previous 2 chunks.
    Instances newData = new Instances(data);
    newData.insertAttributeAt(new Attribute("ID", (FastVector) null), 0);
    int attrIdx = newData.attribute("ID").index(); // Paranoid... should be 0

    for (int c = 0; c < numInstances; c++) {
      newData.instance(c).setValue(attrIdx, instanceNames[c]);
    }
    data = newData;

    // System.err.println("DEBUG: data.numInstances: "+data.numInstances());
    // System.err.println("DEBUG: data.numAttributes: "+data.numAttributes());

    return (data);
  }
 @Override
 protected Instances determineOutputFormat(Instances inputFormat) throws Exception {
   inputFormat.insertAttributeAt(getIndicatorAttribute(), inputFormat.numAttributes() - 1);
   return inputFormat;
 }