public Instances transformInstances(MultiLabelInstances mlData) throws Exception { labelIndices = mlData.getLabelIndices(); numOfLabels = mlData.getNumLabels(); Instances data = mlData.getDataSet(); Instances transformed = new Instances(mlData.getDataSet(), 0); // delete all labels transformed = RemoveAllLabels.transformInstances(transformed, labelIndices); // add single label attribute ArrayList<String> classValues = new ArrayList<String>(numOfLabels); for (int x = 0; x < numOfLabels; x++) { classValues.add("Class" + (x + 1)); } Attribute newClass = new Attribute("Class", classValues); transformed.insertAttributeAt(newClass, transformed.numAttributes()); transformed.setClassIndex(transformed.numAttributes() - 1); for (int instanceIndex = 0; instanceIndex < data.numInstances(); instanceIndex++) { // System.out.println(data.instance(instanceIndex).toString()); List<Instance> result = transformInstance(data.instance(instanceIndex)); for (Instance instance : result) { // System.out.println(instance.toString()); transformed.add(instance); // System.out.println(transformed.instance(transformed.numInstances()-1)); } } return transformed; }
/**
 * Adds the indicator attribute to the data and imputes medians.
 *
 * <p>The indicator attribute is inserted directly in front of the last attribute. The medians are
 * only searched while the first batch has not been completed yet; the imputation itself runs on
 * every call. The passed-in data set is modified in place and returned.
 *
 * @param instances the data to process
 * @return the same data set, after insertion and imputation
 * @throws Exception if processing fails
 */
@Override
protected Instances process(Instances instances) throws Exception {
  int indicatorPosition = instances.numAttributes() - 1;
  instances.insertAttributeAt(getIndicatorAttribute(), indicatorPosition);

  if (!m_FirstBatchDone) {
    searchMedian(instances);
  }
  imputeMedian(instances);

  return instances;
}
/** * processes the instances using the HAAR algorithm * * @param instances the data to process * @return the modified data * @throws Exception in case the processing goes wrong */ protected Instances processHAAR(Instances instances) throws Exception { Instances result; int i; int n; int j; int clsIdx; double[] oldVal; double[] newVal; int level; int length; double[] clsVal; Attribute clsAtt; clsIdx = instances.classIndex(); clsVal = null; clsAtt = null; if (clsIdx > -1) { clsVal = instances.attributeToDoubleArray(clsIdx); clsAtt = (Attribute) instances.classAttribute().copy(); instances.setClassIndex(-1); instances.deleteAttributeAt(clsIdx); } result = new Instances(instances, 0); level = (int) StrictMath.ceil(StrictMath.log(instances.numAttributes()) / StrictMath.log(2.0)); for (i = 0; i < instances.numInstances(); i++) { oldVal = instances.instance(i).toDoubleArray(); newVal = new double[oldVal.length]; for (n = level; n > 0; n--) { length = (int) StrictMath.pow(2, n - 1); for (j = 0; j < length; j++) { newVal[j] = (oldVal[j * 2] + oldVal[j * 2 + 1]) / StrictMath.sqrt(2); newVal[j + length] = (oldVal[j * 2] - oldVal[j * 2 + 1]) / StrictMath.sqrt(2); } System.arraycopy(newVal, 0, oldVal, 0, newVal.length); } // add new transformed instance result.add(new DenseInstance(1, newVal)); } // add class again if (clsIdx > -1) { result.insertAttributeAt(clsAtt, clsIdx); result.setClassIndex(clsIdx); for (i = 0; i < clsVal.length; i++) result.instance(i).setClassValue(clsVal[i]); } return result; }
/**
 * Sets the format of the input instances and derives the output format, which is the input
 * structure with one additional attribute appended at the end.
 *
 * <p>The name of the new attribute is the postfix expression vector when debugging is on, the
 * configured attribute name when it differs from "expression", and the infix expression
 * otherwise.
 *
 * @param instanceInfo an Instances object containing the input instance structure (any instances
 *     contained in the object are ignored - only the structure is required).
 * @return true if the outputFormat may be collected immediately
 * @exception Exception if the format couldn't be set successfully
 */
public boolean setInputFormat(Instances instanceInfo) throws Exception {
  convertInfixToPostfix(new String(m_infixExpression));
  super.setInputFormat(instanceInfo);

  Instances outputFormat = new Instances(instanceInfo, 0);

  // Pick the name first, then build the attribute once.
  final String name;
  if (m_Debug) {
    name = m_postFixExpVector.toString();
  } else if (m_attributeName.equals("expression")) {
    name = m_infixExpression;
  } else {
    name = m_attributeName;
  }

  outputFormat.insertAttributeAt(new Attribute(name), instanceInfo.numAttributes());
  setOutputFormat(outputFormat);
  return true;
}
/** * Sets the format of the input instances. * * @param instanceInfo an Instances object containing the input instance structure (any instances * contained in the object are ignored - only the structure is required). * @return true if the outputFormat may be collected immediately * @throws Exception if the format couldn't be set successfully */ public boolean setInputFormat(Instances instanceInfo) throws Exception { super.setInputFormat(instanceInfo); m_Insert.setUpper(instanceInfo.numAttributes()); Instances outputFormat = new Instances(instanceInfo, 0); Attribute newAttribute = null; switch (m_AttributeType) { case Attribute.NUMERIC: newAttribute = new Attribute(m_Name); break; case Attribute.NOMINAL: newAttribute = new Attribute(m_Name, m_Labels); break; case Attribute.STRING: newAttribute = new Attribute(m_Name, (FastVector) null); break; case Attribute.DATE: newAttribute = new Attribute(m_Name, m_DateFormat); break; default: throw new IllegalArgumentException("Unknown attribute type in Add"); } if ((m_Insert.getIndex() < 0) || (m_Insert.getIndex() > getInputFormat().numAttributes())) { throw new IllegalArgumentException("Index out of range"); } outputFormat.insertAttributeAt(newAttribute, m_Insert.getIndex()); setOutputFormat(outputFormat); // all attributes, except index of added attribute // (otherwise the length of the input/output indices differ) Range atts = new Range(m_Insert.getSingleIndex()); atts.setInvert(true); atts.setUpper(outputFormat.numAttributes() - 1); initOutputLocators(outputFormat, atts.getSelection()); return true; }
/** * pads the data to conform to the necessary number of attributes * * @param data the data to pad * @return the padded data */ protected Instances pad(Instances data) { Instances result; int i; int n; String prefix; int numAtts; boolean isLast; int index; Vector<Integer> padded; int[] indices; FastVector atts; // determine number of padding attributes switch (m_Padding) { case PADDING_ZERO: if (data.classIndex() > -1) numAtts = (nextPowerOf2(data.numAttributes() - 1) + 1) - data.numAttributes(); else numAtts = nextPowerOf2(data.numAttributes()) - data.numAttributes(); break; default: throw new IllegalStateException( "Padding " + new SelectedTag(m_Algorithm, TAGS_PADDING) + " not implemented!"); } result = new Instances(data); prefix = getAlgorithm().getSelectedTag().getReadable(); // any padding necessary? if (numAtts > 0) { // add padding attributes isLast = (data.classIndex() == data.numAttributes() - 1); padded = new Vector<Integer>(); for (i = 0; i < numAtts; i++) { if (isLast) index = result.numAttributes() - 1; else index = result.numAttributes(); result.insertAttributeAt(new Attribute(prefix + "_padding_" + (i + 1)), index); // record index padded.add(new Integer(index)); } // get padded indices indices = new int[padded.size()]; for (i = 0; i < padded.size(); i++) indices[i] = padded.get(i); // determine number of padding attributes switch (m_Padding) { case PADDING_ZERO: for (i = 0; i < result.numInstances(); i++) { for (n = 0; n < indices.length; n++) result.instance(i).setValue(indices[n], 0); } break; } } // rename all attributes apart from class data = result; atts = new FastVector(); n = 0; for (i = 0; i < data.numAttributes(); i++) { n++; if (i == data.classIndex()) atts.addElement((Attribute) data.attribute(i).copy()); else atts.addElement(new Attribute(prefix + "_" + n)); } // create new dataset result = new Instances(data.relationName(), atts, data.numInstances()); result.setClassIndex(data.classIndex()); for (i = 0; i < data.numInstances(); 
i++) result.add(new DenseInstance(1.0, data.instance(i).toDoubleArray())); return result; }
public void generateDataSet() { // Read all the instances in the file (ARFF, CSV, XRFF, ...) try { source = new DataSource("data\\bne.csv"); } catch (Exception e) { // TODO Auto-generated catch block e.printStackTrace(); } // Create data set try { instances = source.getDataSet(); } catch (Exception e) { // TODO Auto-generated catch block e.printStackTrace(); } // Reverse the order of instances in the data set to place them in // chronological order for (int i = 0; i < (instances.numInstances() / 2); i++) { instances.swap(i, instances.numInstances() - 1 - i); } // Remove "volume", "low price", "high price", "opening price" and // "data" from data set instances.deleteAttributeAt(instances.numAttributes() - 1); instances.deleteAttributeAt(instances.numAttributes() - 2); instances.deleteAttributeAt(instances.numAttributes() - 2); instances.deleteAttributeAt(instances.numAttributes() - 2); instances.deleteAttributeAt(instances.numAttributes() - 2); // Create list to hold nominal values "purchase", "sale", "retain" List my_nominal_values = new ArrayList(3); my_nominal_values.add("purchase"); my_nominal_values.add("sale"); my_nominal_values.add("retain"); // Create nominal attribute "classIndex" Attribute classIndex = new Attribute("classIndex", my_nominal_values); // Add "classIndex" as an attribute to each instance instances.insertAttributeAt(classIndex, instances.numAttributes()); // Set the value of "classIndex" for each instance for (int i = 0; i < instances.numInstances() - 1; i++) { if (instances.get(i + 1).value(instances.numAttributes() - 2) > instances.get(i).value(instances.numAttributes() - 2)) { instances.get(i).setValue(instances.numAttributes() - 1, "purchase"); } else if (instances.get(i + 1).value(instances.numAttributes() - 2) < instances.get(i).value(instances.numAttributes() - 2)) { instances.get(i).setValue(instances.numAttributes() - 1, "sale"); } else if (instances.get(i + 1).value(instances.numAttributes() - 2) == 
instances.get(i).value(instances.numAttributes() - 2)) { instances.get(i).setValue(instances.numAttributes() - 1, "retain"); } } // Make the last attribute be the class instances.setClassIndex(instances.numAttributes() - 1); // Calculate and insert technical analysis attributes into data set Strategies strategies = new Strategies(); strategies.applyStrategies(); // Print header and instances System.out.println("\nDataset:\n"); System.out.println(instances); System.out.println(instances.numInstances()); }
/** * ************************************************** Convert a table to a set of instances, with * <b>columns</b> representing individual </b>instances</b> and <b>rows</b> representing * <b>attributes</b> (e.g. as is common with microarray data) */ public Instances tableColsToInstances(Table t, String relationName) { System.err.print("Converting table cols to instances..."); // Set up attributes, which for colInstances will be the rowNames... FastVector atts = new FastVector(); ArrayList<Boolean> isNominal = new ArrayList<Boolean>(); ArrayList<FastVector> allAttVals = new ArrayList<FastVector>(); // Save values for later... System.err.print("creating attributes..."); for (int r = 0; r < t.numRows; r++) { if (rowIsNumeric(t, r)) { isNominal.add(false); atts.addElement(new Attribute(t.rowNames[r])); allAttVals.add(null); // No enumeration of attribute values. } else { // It's nominal... determine the range of values and create a nominal attribute... isNominal.add(true); FastVector attVals = getRowValues(t, r); atts.addElement(new Attribute(t.rowNames[r], attVals)); // Save it for later allAttVals.add(attVals); } } System.err.print("creating instances..."); // Create Instances object.. Instances data = new Instances(relationName, atts, 0); data.setRelationName(relationName); /** ***** CREATE INSTANCES ************* */ // Fill the instances with data... // For each instance... for (int c = 0; c < t.numCols; c++) { double[] vals = new double[data.numAttributes()]; // Even nominal values are stored as double pointers. // For each attribute fill in the numeric or attributeValue index... for (int r = 0; r < t.numRows; r++) { String val = (String) t.matrix.getQuick(r, c); if (val == "?") vals[r] = Instance.missingValue(); else if (isNominal.get(r)) { vals[r] = allAttVals.get(r).indexOf(val); } else { vals[r] = Double.parseDouble((String) val); } } // Add the a newly minted instance with those attribute values... 
data.add(new Instance(1.0, vals)); } System.err.print("add feature names..."); /** ***** ADD FEATURE NAMES ************* */ // takes basically zero time... all time is in previous 2 chunks. if (addInstanceNamesAsFeatures) { Instances newData = new Instances(data); newData.insertAttributeAt(new Attribute("ID", (FastVector) null), 0); int attrIdx = newData.attribute("ID").index(); // Paranoid... should be 0 // We save the instanceNames in a list because it's handy later on... instanceNames = new ArrayList<String>(); for (int c = 0; c < t.colNames.length; c++) { instanceNames.add(t.colNames[c]); newData.instance(c).setValue(attrIdx, t.colNames[c]); } data = newData; } System.err.println("done."); return (data); }
/** * ************************************************** Convert a table to a set of instances, with * <b>rows</b> representing individual </b>instances</b> and <b>columns</b> representing * <b>attributes</b> */ public Instances tableRowsToNominalInstances(Table t, String relationName) { System.err.print("Converting table rows to instances..."); // Set up attributes, which for rowInstances will be the colNames... FastVector atts = new FastVector(); ArrayList<Boolean> isNominal = new ArrayList<Boolean>(); ArrayList<FastVector> allAttVals = new ArrayList<FastVector>(); // Save values for later... System.err.print("creating attributes..."); for (int c = 0; c < t.numCols; c++) { // It's nominal... determine the range of values isNominal.add(true); FastVector attVals = getColValues(t, c); atts.addElement(new Attribute(t.colNames[c], attVals)); // Save it for later allAttVals.add(attVals); } System.err.print("creating instances..."); // Create Instances object.. Instances data = new Instances(relationName, atts, 0); data.setRelationName(relationName); // Fill the instances with data... // For each instance... for (int r = 0; r < t.numRows; r++) { double[] vals = new double[data.numAttributes()]; // for each attribute for (int c = 0; c < t.numCols; c++) { String val = (String) t.matrix.getQuick(r, c); if (val == "?") vals[c] = Instance.missingValue(); else if (isNominal.get(c)) { vals[c] = allAttVals.get(c).indexOf(val); } else { vals[c] = Double.parseDouble((String) val); } } // Add the a newly minted instance with those attribute values... data.add(new Instance(1.0, vals)); } System.err.print("add feature names..."); if (addInstanceNamesAsFeatures) { Instances newData = new Instances(data); newData.insertAttributeAt(new Attribute("ID", (FastVector) null), 0); int attrIdx = newData.attribute("ID").index(); // Paranoid... should be 0 // We save the instanceNames in a list because it's handy later on... 
instanceNames = new ArrayList<String>(); for (int r = 0; r < t.rowNames.length; r++) { instanceNames.add(t.rowNames[r]); newData.instance(r).setValue(attrIdx, t.rowNames[r]); } data = newData; } System.err.println("done."); return (data); }
/** * If we know in advance that the table is numeric, can optimize a lot... For example, on 9803 x * 294 table, TableFileLoader.readNumeric takes 6s compared to 12s for WekaMine readFromTable. */ public static Instances readNumeric(String fileName, String relationName, String delimiter) throws Exception { int numAttributes = FileUtils.fastCountLines(fileName) - 1; // -1 exclude heading. String[] attrNames = new String[numAttributes]; // Read the col headings and figure out the number of columns in the table.. BufferedReader reader = new BufferedReader(new FileReader(fileName), 4194304); String line = reader.readLine(); String[] instanceNames = parseColNames(line, delimiter); int numInstances = instanceNames.length; System.err.print("reading " + numAttributes + " x " + numInstances + " table.."); // Create an array to hold the data as we read it in... double dataArray[][] = new double[numAttributes][numInstances]; // Populate the matrix with values... String valToken = ""; try { int rowIdx = 0; while ((line = reader.readLine()) != null) { String[] tokens = line.split(delimiter, -1); attrNames[rowIdx] = tokens[0].trim(); for (int colIdx = 0; colIdx < (tokens.length - 1); colIdx++) { valToken = tokens[colIdx + 1]; double value; if (valToken.equals("null")) { value = Instance.missingValue(); } else if (valToken.equals("?")) { value = Instance.missingValue(); } else if (valToken.equals("NA")) { value = Instance.missingValue(); } else if (valToken.equals("")) { value = Instance.missingValue(); // }else value = DoubleParser.lightningParse(valToken); // faster double parser with // MANY assumptions } else value = Double.parseDouble(valToken); dataArray[rowIdx][colIdx] = value; } rowIdx++; } } catch (NumberFormatException e) { System.err.println(e.toString()); System.err.println("Parsing line: " + line); System.err.println("Parsing token: " + valToken); } // Set up attributes, which for colInstances will be the rowNames... 
FastVector atts = new FastVector(); for (int a = 0; a < numAttributes; a++) { atts.addElement(new Attribute(attrNames[a])); } // Create Instances object.. Instances data = new Instances(relationName, atts, 0); data.setRelationName(relationName); System.err.print("creating instances.."); // System.err.println("DEBUG: numAttributes "+numAttributes); /** ***** CREATE INSTANCES ************* */ // Fill the instances with data... // For each instance... for (int c = 0; c < numInstances; c++) { double[] vals = new double[data.numAttributes()]; // Even nominal values are stored as double pointers. for (int r = 0; r < numAttributes; r++) { double val = dataArray[r][c]; vals[r] = val; } // Add the a newly minted instance with those attribute values... data.add(new Instance(1.0, vals)); } // System.err.println("DEBUG: data.numInstances: "+data.numInstances()); // System.err.println("DEBUG: data.numAttributes: "+data.numAttributes()); // System.err.println("DEBUG: data.relationNAme"+data.relationName()); System.err.print("add feature names.."); /** ***** ADD FEATURE NAMES ************* */ // takes basically zero time... all time is in previous 2 chunks. Instances newData = new Instances(data); newData.insertAttributeAt(new Attribute("ID", (FastVector) null), 0); int attrIdx = newData.attribute("ID").index(); // Paranoid... should be 0 for (int c = 0; c < numInstances; c++) { newData.instance(c).setValue(attrIdx, instanceNames[c]); } data = newData; // System.err.println("DEBUG: data.numInstances: "+data.numInstances()); // System.err.println("DEBUG: data.numAttributes: "+data.numAttributes()); return (data); }
/**
 * Determines the output format: the input structure with the indicator attribute inserted
 * directly in front of the last attribute.
 *
 * <p>The attribute is inserted into a header-only copy, so the data set passed in by the caller
 * is left untouched.
 *
 * @param inputFormat the input format to base the output format on
 * @return the output format
 * @throws Exception in case the determination goes wrong
 */
@Override
protected Instances determineOutputFormat(Instances inputFormat) throws Exception {
  Instances outputFormat = new Instances(inputFormat, 0);
  outputFormat.insertAttributeAt(getIndicatorAttribute(), outputFormat.numAttributes() - 1);
  return outputFormat;
}