public weka.core.Instances toWekaInstances() { // attributes FastVector wattrs = new FastVector(); Iterator itr = attributes.iterator(); while (itr.hasNext()) { Attribute attr = (Attribute) itr.next(); wattrs.addElement(attr.toWekaAttribute()); } // data instances weka.core.Instances winsts = new weka.core.Instances(name, wattrs, instances.size()); itr = instances.iterator(); while (itr.hasNext()) { Instance inst = (Instance) itr.next(); Iterator itrval = inst.getValues().iterator(); Iterator itrmis = inst.getMissing().iterator(); double[] vals = new double[wattrs.size()]; for (int i = 0; i < wattrs.size(); i++) { double val = (Double) itrval.next(); if ((Boolean) itrmis.next()) { vals[i] = weka.core.Instance.missingValue(); } else { vals[i] = val; } } weka.core.Instance winst = new weka.core.Instance(1, vals); winst.setDataset(winsts); winsts.add(winst); } winsts.setClassIndex(this.class_index); return winsts; }
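/*
 * For context: a self-contained sketch of the same pre-3.7 WEKA construction pattern
 * (FastVector + weka.core.Instance) that toWekaInstances() relies on. The attribute
 * names and values here are illustrative only, not part of the original code.
 */
import weka.core.Attribute;
import weka.core.FastVector;
import weka.core.Instance;
import weka.core.Instances;

public class BuildInstancesSketch {
  public static void main(String[] args) {
    FastVector attrs = new FastVector();
    attrs.addElement(new Attribute("x"));
    attrs.addElement(new Attribute("y"));

    Instances data = new Instances("demo", attrs, 2);
    data.add(new Instance(1.0, new double[] {1.5, 2.5}));
    // Missing cells are written with Instance.missingValue(), exactly as in the loop above.
    data.add(new Instance(1.0, new double[] {Instance.missingValue(), 3.0}));
    data.setClassIndex(data.numAttributes() - 1);

    System.out.println(data);
  }
}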
/** * Input an instance for filtering. Ordinarily the instance is processed and made available for * output immediately. Some filters require all instances be read before producing output. * * @param instance the input instance * @return true if the filtered instance may now be collected with output(). * @exception IllegalStateException if no input format has been defined. * @exception Exception if there was a problem during the filtering. */ public boolean input(Instance instance) throws Exception { if (getInputFormat() == null) { throw new IllegalStateException("No input instance format defined"); } if (m_NewBatch) { resetQueue(); m_NewBatch = false; } double[] vals = new double[instance.numAttributes() + 1]; for (int i = 0; i < instance.numAttributes(); i++) { if (instance.isMissing(i)) { vals[i] = Instance.missingValue(); } else { vals[i] = instance.value(i); } } evaluateExpression(vals); Instance inst = null; if (instance instanceof SparseInstance) { inst = new SparseInstance(instance.weight(), vals); } else { inst = new Instance(instance.weight(), vals); } copyStringValues(inst, false, instance.dataset(), getOutputFormat()); inst.setDataset(getOutputFormat()); push(inst); return true; }
// Calculate how well an app fits onto a VM.
// TODO: gst: use WEKA to calculate the fit factor!
private int calculateFit(App app2, VirtualMachine vm) {
  int output = 0;
  if (!Action.isOnlyLearning() && CreateAppInsertIntoVm.evaluation != null) {
    // Is there free capacity in the VM? Note: the storage term is assumed to be checked
    // against the storage allocation; the original compared it against the CPU allocation,
    // which looks like a copy-paste slip (getCurrentStorageAllocation() is the assumed
    // accessor name).
    if (app2.getCpu() + vm.getCurrentCpuUsage() < vm.getCurrentCpuAllocation()
        && app2.getMemory() + vm.getCurrentMemoryUsage() < vm.getCurrentMemoryAllocation()
        && app2.getStorage() + vm.getCurrentStorageUsage() < vm.getCurrentStorageAllocation()) {
      Instance instance = createInstance(Instance.missingValue(), vm);
      instance.setDataset(CreateAppInsertIntoVm.getKnowledgeBase());
      try {
        output = (int) (evaluation.evaluateModelOnce(classifier, instance) * 100);
      } catch (Exception e) {
        e.printStackTrace();
      }
    }
  } else {
    if (app2.getCpu() + vm.getCurrentCpuUsage() < vm.getCurrentCpuAllocation()
        && app2.getMemory() + vm.getCurrentMemoryUsage() < vm.getCurrentMemoryAllocation()
        && app2.getStorage() + vm.getCurrentStorageUsage() < vm.getCurrentStorageAllocation()) {
      output = randomData.nextInt(1, 100);
    }
  }
  return output;
}
// Use the TriTrainer classifier to classify an instance.
public double classifyInstance(Instance instance) throws Exception {
  double result;
  double[] dist;
  int index;

  dist = distributionForInstance(instance); // class membership probabilities
  if (instance.classAttribute().isNominal()) {
    index = Utils.maxIndex(dist); // index of the most probable class
    if (dist[index] == 0) {
      result = Instance.missingValue();
    } else {
      // Return the class index, not its probability, to match the WEKA
      // classifyInstance() contract (the original returned dist[index]).
      result = index;
    }
  } else if (instance.classAttribute().isNumeric()) {
    result = dist[0];
  } else {
    result = Instance.missingValue();
  }
  return result;
}
/**
 * Evaluate the expression using the supplied array of attribute values. The result is stored
 * in the last element of the array. Assumes that the infix expression has been converted to
 * postfix and stored in m_postFixExpVector.
 *
 * @param vals the values to apply the expression to
 * @exception Exception if something goes wrong
 */
private void evaluateExpression(double[] vals) throws Exception {
  Stack operands = new Stack();

  for (int i = 0; i < m_postFixExpVector.size(); i++) {
    Object nextob = m_postFixExpVector.elementAt(i);
    if (nextob instanceof NumericOperand) {
      operands.push(new Double(((NumericOperand) nextob).m_numericConst));
    } else if (nextob instanceof AttributeOperand) {
      double value = vals[((AttributeOperand) nextob).m_attributeIndex];
      // Missing values are encoded as NaN, so the original test
      // "value == Instance.missingValue()" could never be true; use isMissingValue().
      if (Instance.isMissingValue(value)) {
        // A missing operand makes the whole result missing; return here so the
        // assignment is not overwritten by the bookkeeping below.
        vals[vals.length - 1] = Instance.missingValue();
        return;
      }
      if (((AttributeOperand) nextob).m_negative) {
        value = -value;
      }
      operands.push(new Double(value));
    } else if (nextob instanceof Operator) {
      char op = ((Operator) nextob).m_operator;
      if (isUnaryFunction(op)) {
        double operand = ((Double) operands.pop()).doubleValue();
        double result = ((Operator) nextob).applyFunction(operand);
        operands.push(new Double(result));
      } else {
        double second = ((Double) operands.pop()).doubleValue();
        double first = ((Double) operands.pop()).doubleValue();
        double result = ((Operator) nextob).applyOperator(first, second);
        operands.push(new Double(result));
      }
    } else {
      throw new Exception("Unknown object in postfix vector!");
    }
  }

  if (operands.size() != 1) {
    throw new Exception("Problem applying function");
  }

  Double result = (Double) operands.pop();
  if (result.isNaN() || result.isInfinite()) {
    vals[vals.length - 1] = Instance.missingValue();
  } else {
    vals[vals.length - 1] = result.doubleValue();
  }
}
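/*
 * The missing-value fix above hinges on how WEKA encodes missing values: Instance.missingValue()
 * returns Double.NaN, and NaN never compares equal to anything, so an "==" test against it can
 * never fire. A minimal sketch of the difference:
 */
import weka.core.Instance;

public class MissingValueCheckSketch {
  public static void main(String[] args) {
    double m = Instance.missingValue();                // encoded as Double.NaN
    System.out.println(m == Instance.missingValue());  // false: NaN != NaN
    System.out.println(Instance.isMissingValue(m));    // true: the supported test
  }
}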
private List<Instance> myExtractKeyphrases(String document, int numOfPhrases) throws Exception { // Check whether there is actually any data // if (document.length() == 0 || document.equals("")) { throw new Exception("Couldn't find any data!"); } FastVector atts = new FastVector(3); atts.addElement(new Attribute("doc", (FastVector) null)); atts.addElement(new Attribute("keyphrases", (FastVector) null)); Instances data = new Instances("keyphrase_training_data", atts, 0); List<Instance> myInstances = new ArrayList<Instance>(); double[] newInst = new double[2]; newInst[0] = (double) data.attribute(0).addStringValue(document); newInst[1] = Instance.missingValue(); data.add(new Instance(1.0, newInst)); m_KEAFilter.input(data.instance(0)); data = data.stringFreeStructure(); ke.setNumPhrases(numOfPhrases); int numPhrases = numOfPhrases; // ke.getNumPhrases(); Instance[] topRankedInstances = new Instance[numPhrases]; Instance inst; // Iterating over all extracted keyphrases (inst) while ((inst = m_KEAFilter.output()) != null) { int index = (int) inst.value(m_KEAFilter.getRankIndex()) - 1; if (index < numPhrases) { topRankedInstances[index] = inst; } } double numExtracted = 0, numCorrect = 0; for (int i = 0; i < numPhrases; i++) { if (topRankedInstances[i] != null) { if (!topRankedInstances[i].isMissing(topRankedInstances[i].numAttributes() - 1)) { numExtracted += 1.0; } if ((int) topRankedInstances[i].value(topRankedInstances[i].numAttributes() - 1) == 1) { numCorrect += 1.0; } myInstances.add(topRankedInstances[i]); } } return myInstances; }
/** * Classifies the given test instance. The instance has to belong to a dataset when it's being * classified. Note that a classifier MUST implement either this or distributionForInstance(). * * @param instance the instance to be classified * @return the predicted most likely class for the instance or Instance.missingValue() if no * prediction is made * @exception Exception if an error occurred during the prediction */ public double classifyInstance(Instance instance) throws Exception { double[] dist = distributionForInstance(instance); if (dist == null) { throw new Exception("Null distribution predicted"); } double max = 0; int maxIndex = 0; for (int i = 0; i < dist.length; i++) { if (dist[i] > max) { maxIndex = i; max = dist[i]; } } if (max > 0) { return maxIndex; } else { return Instance.missingValue(); } }
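/*
 * A hedged usage sketch for the classifyInstance() contract above: the instance must belong to a
 * dataset so that classAttribute() is defined, and a return value of Instance.missingValue()
 * means "no prediction". The classifier (J48) and the ARFF file name are assumptions made for
 * the sake of a runnable example, not part of the original code.
 */
import java.io.BufferedReader;
import java.io.FileReader;
import weka.classifiers.Classifier;
import weka.classifiers.trees.J48;
import weka.core.Instance;
import weka.core.Instances;

public class ClassifySketch {
  public static void main(String[] args) throws Exception {
    Instances data = new Instances(new BufferedReader(new FileReader("iris.arff")));
    data.setClassIndex(data.numAttributes() - 1);

    Classifier cls = new J48();
    cls.buildClassifier(data);

    Instance first = data.instance(0);        // already belongs to "data"
    double pred = cls.classifyInstance(first);
    if (Instance.isMissingValue(pred)) {
      System.out.println("no prediction made");
    } else {
      System.out.println("predicted class: " + data.classAttribute().value((int) pred));
    }
  }
}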
/** * Creates a new instance the same as one instance (the "destination") but with some attribute * values copied from another instance (the "source") * * @param source the source instance * @param dest the destination instance * @return the new merged instance */ protected Instance mergeInstances(Instance source, Instance dest) { Instances outputFormat = outputFormatPeek(); double[] vals = new double[outputFormat.numAttributes()]; for (int i = 0; i < vals.length; i++) { if ((i != outputFormat.classIndex()) && (m_SelectedCols.isInRange(i))) { if (source != null) { vals[i] = source.value(i); } else { vals[i] = Instance.missingValue(); } } else { vals[i] = dest.value(i); } } Instance inst = null; if (dest instanceof SparseInstance) { inst = new SparseInstance(dest.weight(), vals); } else { inst = new Instance(dest.weight(), vals); } inst.setDataset(dest.dataset()); return inst; }
/** * Makes a database query to convert a table into a set of instances * * @param query the query to convert to instances * @return the instances contained in the result of the query, NULL if the SQL query doesn't * return a ResultSet, e.g., DELETE/INSERT/UPDATE * @throws Exception if an error occurs */ public Instances retrieveInstances(String query) throws Exception { if (m_Debug) System.err.println("Executing query: " + query); connectToDatabase(); if (execute(query) == false) { if (m_PreparedStatement.getUpdateCount() == -1) { throw new Exception("Query didn't produce results"); } else { if (m_Debug) System.err.println(m_PreparedStatement.getUpdateCount() + " rows affected."); close(); return null; } } ResultSet rs = getResultSet(); if (m_Debug) System.err.println("Getting metadata..."); ResultSetMetaData md = rs.getMetaData(); if (m_Debug) System.err.println("Completed getting metadata..."); // Determine structure of the instances int numAttributes = md.getColumnCount(); int[] attributeTypes = new int[numAttributes]; Hashtable[] nominalIndexes = new Hashtable[numAttributes]; FastVector[] nominalStrings = new FastVector[numAttributes]; for (int i = 1; i <= numAttributes; i++) { /* switch (md.getColumnType(i)) { case Types.CHAR: case Types.VARCHAR: case Types.LONGVARCHAR: case Types.BINARY: case Types.VARBINARY: case Types.LONGVARBINARY:*/ switch (translateDBColumnType(md.getColumnTypeName(i))) { case STRING: // System.err.println("String --> nominal"); attributeTypes[i - 1] = Attribute.NOMINAL; nominalIndexes[i - 1] = new Hashtable(); nominalStrings[i - 1] = new FastVector(); break; case TEXT: // System.err.println("Text --> string"); attributeTypes[i - 1] = Attribute.STRING; nominalIndexes[i - 1] = new Hashtable(); nominalStrings[i - 1] = new FastVector(); break; case BOOL: // System.err.println("boolean --> nominal"); attributeTypes[i - 1] = Attribute.NOMINAL; nominalIndexes[i - 1] = new Hashtable(); nominalIndexes[i - 1].put("false", new Double(0)); nominalIndexes[i - 1].put("true", new Double(1)); nominalStrings[i - 1] = new FastVector(); nominalStrings[i - 1].addElement("false"); nominalStrings[i - 1].addElement("true"); break; case DOUBLE: // System.err.println("BigDecimal --> numeric"); attributeTypes[i - 1] = Attribute.NUMERIC; break; case BYTE: // System.err.println("byte --> numeric"); attributeTypes[i - 1] = Attribute.NUMERIC; break; case SHORT: // System.err.println("short --> numeric"); attributeTypes[i - 1] = Attribute.NUMERIC; break; case INTEGER: // System.err.println("int --> numeric"); attributeTypes[i - 1] = Attribute.NUMERIC; break; case LONG: // System.err.println("long --> numeric"); attributeTypes[i - 1] = Attribute.NUMERIC; break; case FLOAT: // System.err.println("float --> numeric"); attributeTypes[i - 1] = Attribute.NUMERIC; break; case DATE: attributeTypes[i - 1] = Attribute.DATE; break; case TIME: attributeTypes[i - 1] = Attribute.DATE; break; default: // System.err.println("Unknown column type"); attributeTypes[i - 1] = Attribute.STRING; } } // For sqlite // cache column names because the last while(rs.next()) { iteration for // the tuples below will close the md object: Vector<String> columnNames = new Vector<String>(); for (int i = 0; i < numAttributes; i++) { columnNames.add(md.getColumnName(i + 1)); } // Step through the tuples if (m_Debug) System.err.println("Creating instances..."); FastVector instances = new FastVector(); int rowCount = 0; while (rs.next()) { if (rowCount % 100 == 0) { if (m_Debug) { System.err.print("read " + rowCount + " 
instances \r"); System.err.flush(); } } double[] vals = new double[numAttributes]; for (int i = 1; i <= numAttributes; i++) { /*switch (md.getColumnType(i)) { case Types.CHAR: case Types.VARCHAR: case Types.LONGVARCHAR: case Types.BINARY: case Types.VARBINARY: case Types.LONGVARBINARY:*/ switch (translateDBColumnType(md.getColumnTypeName(i))) { case STRING: String str = rs.getString(i); if (rs.wasNull()) { vals[i - 1] = Instance.missingValue(); } else { Double index = (Double) nominalIndexes[i - 1].get(str); if (index == null) { index = new Double(nominalStrings[i - 1].size()); nominalIndexes[i - 1].put(str, index); nominalStrings[i - 1].addElement(str); } vals[i - 1] = index.doubleValue(); } break; case TEXT: String txt = rs.getString(i); if (rs.wasNull()) { vals[i - 1] = Instance.missingValue(); } else { Double index = (Double) nominalIndexes[i - 1].get(txt); if (index == null) { index = new Double(nominalStrings[i - 1].size()); nominalIndexes[i - 1].put(txt, index); nominalStrings[i - 1].addElement(txt); } vals[i - 1] = index.doubleValue(); } break; case BOOL: boolean boo = rs.getBoolean(i); if (rs.wasNull()) { vals[i - 1] = Instance.missingValue(); } else { vals[i - 1] = (boo ? 1.0 : 0.0); } break; case DOUBLE: // BigDecimal bd = rs.getBigDecimal(i, 4); double dd = rs.getDouble(i); // Use the column precision instead of 4? if (rs.wasNull()) { vals[i - 1] = Instance.missingValue(); } else { // newInst.setValue(i - 1, bd.doubleValue()); vals[i - 1] = dd; } break; case BYTE: byte by = rs.getByte(i); if (rs.wasNull()) { vals[i - 1] = Instance.missingValue(); } else { vals[i - 1] = (double) by; } break; case SHORT: short sh = rs.getShort(i); if (rs.wasNull()) { vals[i - 1] = Instance.missingValue(); } else { vals[i - 1] = (double) sh; } break; case INTEGER: int in = rs.getInt(i); if (rs.wasNull()) { vals[i - 1] = Instance.missingValue(); } else { vals[i - 1] = (double) in; } break; case LONG: long lo = rs.getLong(i); if (rs.wasNull()) { vals[i - 1] = Instance.missingValue(); } else { vals[i - 1] = (double) lo; } break; case FLOAT: float fl = rs.getFloat(i); if (rs.wasNull()) { vals[i - 1] = Instance.missingValue(); } else { vals[i - 1] = (double) fl; } break; case DATE: Date date = rs.getDate(i); if (rs.wasNull()) { vals[i - 1] = Instance.missingValue(); } else { // TODO: Do a value check here. vals[i - 1] = (double) date.getTime(); } break; case TIME: Time time = rs.getTime(i); if (rs.wasNull()) { vals[i - 1] = Instance.missingValue(); } else { // TODO: Do a value check here. 
vals[i - 1] = (double) time.getTime(); } break; default: vals[i - 1] = Instance.missingValue(); } } Instance newInst; if (m_CreateSparseData) { newInst = new SparseInstance(1.0, vals); } else { newInst = new Instance(1.0, vals); } instances.addElement(newInst); rowCount++; } // disconnectFromDatabase(); (perhaps other queries might be made) // Create the header and add the instances to the dataset if (m_Debug) System.err.println("Creating header..."); FastVector attribInfo = new FastVector(); for (int i = 0; i < numAttributes; i++) { /* Fix for databases that uppercase column names */ // String attribName = attributeCaseFix(md.getColumnName(i + 1)); String attribName = attributeCaseFix(columnNames.get(i)); switch (attributeTypes[i]) { case Attribute.NOMINAL: attribInfo.addElement(new Attribute(attribName, nominalStrings[i])); break; case Attribute.NUMERIC: attribInfo.addElement(new Attribute(attribName)); break; case Attribute.STRING: Attribute att = new Attribute(attribName, (FastVector) null); attribInfo.addElement(att); for (int n = 0; n < nominalStrings[i].size(); n++) { att.addStringValue((String) nominalStrings[i].elementAt(n)); } break; case Attribute.DATE: attribInfo.addElement(new Attribute(attribName, (String) null)); break; default: throw new Exception("Unknown attribute type"); } } Instances result = new Instances("QueryResult", attribInfo, instances.size()); for (int i = 0; i < instances.size(); i++) { result.add((Instance) instances.elementAt(i)); } close(rs); return result; }
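/*
 * If this retrieveInstances() mirrors weka.experiment.InstanceQuery (which exposes the same
 * retrieveInstances(String) contract), usage looks roughly like the sketch below. The JDBC URL,
 * credentials, and table name are placeholders, not values from the original code.
 */
import weka.core.Instances;
import weka.experiment.InstanceQuery;

public class QuerySketch {
  public static void main(String[] args) throws Exception {
    InstanceQuery query = new InstanceQuery();
    query.setDatabaseURL("jdbc:mysql://localhost/mydb"); // placeholder URL
    query.setUsername("user");                           // placeholder credentials
    query.setPassword("pass");

    // Returns null for DELETE/INSERT/UPDATE statements, as documented above.
    Instances data = query.retrieveInstances("SELECT * FROM results");
    if (data != null) {
      System.out.println(data.numInstances() + " rows retrieved");
    }
    query.disconnectFromDatabase();
  }
}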
/** Builds the model from the files */ public void extractKeyphrases(Hashtable stems) throws Exception { Vector stats = new Vector(); // Check whether there is actually any data if (stems.size() == 0) { throw new Exception("Couldn't find any data!"); } FastVector atts = new FastVector(2); atts.addElement(new Attribute("doc", (FastVector) null)); atts.addElement(new Attribute("keyphrases", (FastVector) null)); Instances data = new Instances("keyphrase_training_data", atts, 0); // Extract keyphrases Enumeration elem = stems.keys(); while (elem.hasMoreElements()) { String str = (String) elem.nextElement(); double[] newInst = new double[2]; try { File txt = new File(m_dirName + "/" + str + ".txt"); Reader is; if (!m_encoding.equals("default")) { is = new BomStrippingInputStreamReader(new FileInputStream(txt), m_encoding); } else { is = new BomStrippingInputStreamReader(new FileInputStream(txt)); } StringBuffer txtStr = new StringBuffer(); int c; while ((c = is.read()) != -1) { txtStr.append((char) c); } newInst[0] = (double) data.attribute(0).addStringValue(txtStr.toString()); } catch (Exception e) { if (m_debug) { System.err.println("Can't read document " + str + ".txt"); } newInst[0] = Instance.missingValue(); } try { File key = new File(m_dirName + "/" + str + ".key"); Reader is; if (!m_encoding.equals("default")) { is = new BomStrippingInputStreamReader(new FileInputStream(key), m_encoding); } else { is = new BomStrippingInputStreamReader(new FileInputStream(key)); } StringBuffer keyStr = new StringBuffer(); int c; while ((c = is.read()) != -1) { keyStr.append((char) c); } newInst[1] = (double) data.attribute(1).addStringValue(keyStr.toString()); } catch (Exception e) { if (m_debug) { System.err.println("No keyphrases for stem " + str + "."); } newInst[1] = Instance.missingValue(); } data.add(new Instance(1.0, newInst)); m_KEAFilter.input(data.instance(0)); data = data.stringFreeStructure(); if (m_debug) { System.err.println("-- Document: " + str); } Instance[] topRankedInstances = new Instance[m_numPhrases]; Instance inst; while ((inst = m_KEAFilter.output()) != null) { int index = (int) inst.value(m_KEAFilter.getRankIndex()) - 1; if (index < m_numPhrases) { topRankedInstances[index] = inst; } } if (m_debug) { System.err.println("-- Keyphrases and feature values:"); } FileOutputStream out = null; PrintWriter printer = null; File key = new File(m_dirName + "/" + str + ".key"); if (!key.exists()) { out = new FileOutputStream(m_dirName + "/" + str + ".key"); if (!m_encoding.equals("default")) { printer = new PrintWriter(new OutputStreamWriter(out, m_encoding)); } else { printer = new PrintWriter(out); } } double numExtracted = 0, numCorrect = 0; for (int i = 0; i < m_numPhrases; i++) { if (topRankedInstances[i] != null) { if (!topRankedInstances[i].isMissing(topRankedInstances[i].numAttributes() - 1)) { numExtracted += 1.0; } if ((int) topRankedInstances[i].value(topRankedInstances[i].numAttributes() - 1) == topRankedInstances[i] .attribute(topRankedInstances[i].numAttributes() - 1) .indexOfValue("True")) { numCorrect += 1.0; } if (printer != null) { printer.print(topRankedInstances[i].stringValue(m_KEAFilter.getUnstemmedPhraseIndex())); if (m_AdditionalInfo) { printer.print("\t"); printer.print(topRankedInstances[i].stringValue(m_KEAFilter.getStemmedPhraseIndex())); printer.print("\t"); printer.print( Utils.doubleToString( topRankedInstances[i].value(m_KEAFilter.getProbabilityIndex()), 4)); } printer.println(); } if (m_debug) { System.err.println(topRankedInstances[i]); } } } if 
(numExtracted > 0) { if (m_debug) { System.err.println("-- " + numCorrect + " correct"); } stats.addElement(new Double(numCorrect)); } if (printer != null) { printer.flush(); printer.close(); out.close(); } } double[] st = new double[stats.size()]; for (int i = 0; i < stats.size(); i++) { st[i] = ((Double) stats.elementAt(i)).doubleValue(); } double avg = Utils.mean(st); double stdDev = Math.sqrt(Utils.variance(st)); System.err.println( "Avg. number of correct keyphrases: " + Utils.doubleToString(avg, 2) + " +/- " + Utils.doubleToString(stdDev, 2)); System.err.println("Based on " + stats.size() + " documents"); m_KEAFilter.batchFinished(); }
/** Builds the model from the training data */ public void buildModel(HashSet<String> fileNames) throws Exception { // Check whether there is actually any data if (fileNames.size() == 0) { throw new Exception("Couldn't find any data in " + inputDirectoryName); } System.err.println("-- Building the model... "); FastVector atts = new FastVector(3); atts.addElement(new Attribute("filename", (FastVector) null)); atts.addElement(new Attribute("document", (FastVector) null)); atts.addElement(new Attribute("keyphrases", (FastVector) null)); Instances data = new Instances("keyphrase_training_data", atts, 0); // Build model mauiFilter = new MauiFilter(); mauiFilter.setDebug(getDebug()); mauiFilter.setMaxPhraseLength(getMaxPhraseLength()); mauiFilter.setMinPhraseLength(getMinPhraseLength()); mauiFilter.setMinNumOccur(getMinNumOccur()); mauiFilter.setStemmer(getStemmer()); mauiFilter.setDocumentLanguage(getDocumentLanguage()); mauiFilter.setVocabularyName(getVocabularyName()); mauiFilter.setVocabularyFormat(getVocabularyFormat()); mauiFilter.setStopwords(getStopwords()); if (wikipedia != null) { mauiFilter.setWikipedia(wikipedia); } else if (wikipediaServer.equals("localhost") && wikipediaDatabase.equals("database")) { mauiFilter.setWikipedia(wikipedia); } else { mauiFilter.setWikipedia( wikipediaServer, wikipediaDatabase, cacheWikipediaData, wikipediaDataDirectory); } if (classifier != null) { mauiFilter.setClassifier(classifier); } mauiFilter.setInputFormat(data); // set features configurations mauiFilter.setBasicFeatures(useBasicFeatures); mauiFilter.setKeyphrasenessFeature(useKeyphrasenessFeature); mauiFilter.setFrequencyFeatures(useFrequencyFeatures); mauiFilter.setPositionsFeatures(usePositionsFeatures); mauiFilter.setLengthFeature(useLengthFeature); mauiFilter.setThesaurusFeatures(useNodeDegreeFeature); mauiFilter.setBasicWikipediaFeatures(useBasicWikipediaFeatures); mauiFilter.setAllWikipediaFeatures(useAllWikipediaFeatures); mauiFilter.setThesaurusFeatures(useNodeDegreeFeature); mauiFilter.setClassifier(classifier); mauiFilter.setContextSize(contextSize); mauiFilter.setMinKeyphraseness(minKeyphraseness); mauiFilter.setMinSenseProbability(minSenseProbability); if (!vocabularyName.equals("none") && !vocabularyName.equals("wikipedia")) { mauiFilter.loadThesaurus(getStemmer(), getStopwords()); } System.err.println("-- Reading the input documents... 
"); for (String fileName : fileNames) { double[] newInst = new double[3]; newInst[0] = (double) data.attribute(0).addStringValue(fileName); ; File documentTextFile = new File(inputDirectoryName + "/" + fileName + ".txt"); File documentTopicsFile = new File(inputDirectoryName + "/" + fileName + ".key"); try { InputStreamReader is; if (!documentEncoding.equals("default")) { is = new InputStreamReader(new FileInputStream(documentTextFile), documentEncoding); } else { is = new InputStreamReader(new FileInputStream(documentTextFile)); } // Reading the file content StringBuffer txtStr = new StringBuffer(); int c; while ((c = is.read()) != -1) { txtStr.append((char) c); } is.close(); // Adding the text of the document to the instance newInst[1] = (double) data.attribute(1).addStringValue(txtStr.toString()); } catch (Exception e) { System.err.println("Problem with reading " + documentTextFile); e.printStackTrace(); newInst[1] = Instance.missingValue(); } try { InputStreamReader is; if (!documentEncoding.equals("default")) { is = new InputStreamReader(new FileInputStream(documentTopicsFile), documentEncoding); } else { is = new InputStreamReader(new FileInputStream(documentTopicsFile)); } // Reading the content of the keyphrase file StringBuffer keyStr = new StringBuffer(); int c; while ((c = is.read()) != -1) { keyStr.append((char) c); } // Adding the topics to the file newInst[2] = (double) data.attribute(2).addStringValue(keyStr.toString()); } catch (Exception e) { System.err.println("Problem with reading " + documentTopicsFile); e.printStackTrace(); newInst[2] = Instance.missingValue(); } data.add(new Instance(1.0, newInst)); mauiFilter.input(data.instance(0)); data = data.stringFreeStructure(); } mauiFilter.batchFinished(); while ((mauiFilter.output()) != null) {} ; }
/**
 * **************************************************
 * Convert a table to a set of instances, with <b>columns</b> representing individual
 * <b>instances</b> and <b>rows</b> representing <b>attributes</b> (e.g. as is common with
 * microarray data).
 */
public Instances tableColsToInstances(Table t, String relationName) {
  System.err.print("Converting table cols to instances...");

  // Set up attributes, which for colInstances will be the rowNames...
  FastVector atts = new FastVector();
  ArrayList<Boolean> isNominal = new ArrayList<Boolean>();
  ArrayList<FastVector> allAttVals = new ArrayList<FastVector>(); // Save values for later...

  System.err.print("creating attributes...");
  for (int r = 0; r < t.numRows; r++) {
    if (rowIsNumeric(t, r)) {
      isNominal.add(false);
      atts.addElement(new Attribute(t.rowNames[r]));
      allAttVals.add(null); // No enumeration of attribute values.
    } else {
      // It's nominal... determine the range of values and create a nominal attribute...
      isNominal.add(true);
      FastVector attVals = getRowValues(t, r);
      atts.addElement(new Attribute(t.rowNames[r], attVals));
      allAttVals.add(attVals); // Save it for later
    }
  }

  System.err.print("creating instances...");
  // Create Instances object..
  Instances data = new Instances(relationName, atts, 0);
  data.setRelationName(relationName);

  /* ***** CREATE INSTANCES ************* */
  // Fill the instances with data...
  // For each instance...
  for (int c = 0; c < t.numCols; c++) {
    double[] vals = new double[data.numAttributes()]; // Even nominal values are stored as double pointers.
    // For each attribute fill in the numeric or attributeValue index...
    for (int r = 0; r < t.numRows; r++) {
      String val = (String) t.matrix.getQuick(r, c);
      if ("?".equals(val)) { // was: val == "?", which compares references, not contents
        vals[r] = Instance.missingValue();
      } else if (isNominal.get(r)) {
        vals[r] = allAttVals.get(r).indexOf(val);
      } else {
        vals[r] = Double.parseDouble(val);
      }
    }
    // Add a newly minted instance with those attribute values...
    data.add(new Instance(1.0, vals));
  }

  System.err.print("add feature names...");
  /* ***** ADD FEATURE NAMES ************* */
  // Takes basically zero time... all time is in the previous two chunks.
  if (addInstanceNamesAsFeatures) {
    Instances newData = new Instances(data);
    newData.insertAttributeAt(new Attribute("ID", (FastVector) null), 0);
    int attrIdx = newData.attribute("ID").index(); // Paranoid... should be 0

    // We save the instanceNames in a list because it's handy later on...
    instanceNames = new ArrayList<String>();
    for (int c = 0; c < t.colNames.length; c++) {
      instanceNames.add(t.colNames[c]);
      newData.instance(c).setValue(attrIdx, t.colNames[c]);
    }
    data = newData;
  }
  System.err.println("done.");
  return (data);
}
/**
 * **************************************************
 * Convert a table to a set of instances, with <b>rows</b> representing individual
 * <b>instances</b> and <b>columns</b> representing <b>attributes</b>.
 */
public Instances tableRowsToNominalInstances(Table t, String relationName) {
  System.err.print("Converting table rows to instances...");

  // Set up attributes, which for rowInstances will be the colNames...
  FastVector atts = new FastVector();
  ArrayList<Boolean> isNominal = new ArrayList<Boolean>();
  ArrayList<FastVector> allAttVals = new ArrayList<FastVector>(); // Save values for later...

  System.err.print("creating attributes...");
  for (int c = 0; c < t.numCols; c++) {
    // It's nominal... determine the range of values
    isNominal.add(true);
    FastVector attVals = getColValues(t, c);
    atts.addElement(new Attribute(t.colNames[c], attVals));
    allAttVals.add(attVals); // Save it for later
  }

  System.err.print("creating instances...");
  // Create Instances object..
  Instances data = new Instances(relationName, atts, 0);
  data.setRelationName(relationName);

  // Fill the instances with data...
  // For each instance...
  for (int r = 0; r < t.numRows; r++) {
    double[] vals = new double[data.numAttributes()];
    // For each attribute...
    for (int c = 0; c < t.numCols; c++) {
      String val = (String) t.matrix.getQuick(r, c);
      if ("?".equals(val)) { // was: val == "?", a reference comparison
        vals[c] = Instance.missingValue();
      } else if (isNominal.get(c)) {
        vals[c] = allAttVals.get(c).indexOf(val);
      } else {
        vals[c] = Double.parseDouble(val);
      }
    }
    // Add a newly minted instance with those attribute values...
    data.add(new Instance(1.0, vals));
  }

  System.err.print("add feature names...");
  if (addInstanceNamesAsFeatures) {
    Instances newData = new Instances(data);
    newData.insertAttributeAt(new Attribute("ID", (FastVector) null), 0);
    int attrIdx = newData.attribute("ID").index(); // Paranoid... should be 0

    // We save the instanceNames in a list because it's handy later on...
    instanceNames = new ArrayList<String>();
    for (int r = 0; r < t.rowNames.length; r++) {
      instanceNames.add(t.rowNames[r]);
      newData.instance(r).setValue(attrIdx, t.rowNames[r]);
    }
    data = newData;
  }
  System.err.println("done.");
  return (data);
}
/** * If we know in advance that the table is numeric, can optimize a lot... For example, on 9803 x * 294 table, TableFileLoader.readNumeric takes 6s compared to 12s for WekaMine readFromTable. */ public static Instances readNumeric(String fileName, String relationName, String delimiter) throws Exception { int numAttributes = FileUtils.fastCountLines(fileName) - 1; // -1 exclude heading. String[] attrNames = new String[numAttributes]; // Read the col headings and figure out the number of columns in the table.. BufferedReader reader = new BufferedReader(new FileReader(fileName), 4194304); String line = reader.readLine(); String[] instanceNames = parseColNames(line, delimiter); int numInstances = instanceNames.length; System.err.print("reading " + numAttributes + " x " + numInstances + " table.."); // Create an array to hold the data as we read it in... double dataArray[][] = new double[numAttributes][numInstances]; // Populate the matrix with values... String valToken = ""; try { int rowIdx = 0; while ((line = reader.readLine()) != null) { String[] tokens = line.split(delimiter, -1); attrNames[rowIdx] = tokens[0].trim(); for (int colIdx = 0; colIdx < (tokens.length - 1); colIdx++) { valToken = tokens[colIdx + 1]; double value; if (valToken.equals("null")) { value = Instance.missingValue(); } else if (valToken.equals("?")) { value = Instance.missingValue(); } else if (valToken.equals("NA")) { value = Instance.missingValue(); } else if (valToken.equals("")) { value = Instance.missingValue(); // }else value = DoubleParser.lightningParse(valToken); // faster double parser with // MANY assumptions } else value = Double.parseDouble(valToken); dataArray[rowIdx][colIdx] = value; } rowIdx++; } } catch (NumberFormatException e) { System.err.println(e.toString()); System.err.println("Parsing line: " + line); System.err.println("Parsing token: " + valToken); } // Set up attributes, which for colInstances will be the rowNames... FastVector atts = new FastVector(); for (int a = 0; a < numAttributes; a++) { atts.addElement(new Attribute(attrNames[a])); } // Create Instances object.. Instances data = new Instances(relationName, atts, 0); data.setRelationName(relationName); System.err.print("creating instances.."); // System.err.println("DEBUG: numAttributes "+numAttributes); /** ***** CREATE INSTANCES ************* */ // Fill the instances with data... // For each instance... for (int c = 0; c < numInstances; c++) { double[] vals = new double[data.numAttributes()]; // Even nominal values are stored as double pointers. for (int r = 0; r < numAttributes; r++) { double val = dataArray[r][c]; vals[r] = val; } // Add the a newly minted instance with those attribute values... data.add(new Instance(1.0, vals)); } // System.err.println("DEBUG: data.numInstances: "+data.numInstances()); // System.err.println("DEBUG: data.numAttributes: "+data.numAttributes()); // System.err.println("DEBUG: data.relationNAme"+data.relationName()); System.err.print("add feature names.."); /** ***** ADD FEATURE NAMES ************* */ // takes basically zero time... all time is in previous 2 chunks. Instances newData = new Instances(data); newData.insertAttributeAt(new Attribute("ID", (FastVector) null), 0); int attrIdx = newData.attribute("ID").index(); // Paranoid... 
should be 0 for (int c = 0; c < numInstances; c++) { newData.instance(c).setValue(attrIdx, instanceNames[c]); } data = newData; // System.err.println("DEBUG: data.numInstances: "+data.numInstances()); // System.err.println("DEBUG: data.numAttributes: "+data.numAttributes()); return (data); }
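/*
 * A short usage sketch for readNumeric(), assuming it lives in the TableFileLoader class named
 * in the comment above; the file name and delimiter are placeholders. The input is expected to
 * be a delimited matrix with a header row of instance (column) names and one attribute per row.
 */
import weka.core.Instances;

public class ReadNumericSketch {
  public static void main(String[] args) throws Exception {
    Instances data = TableFileLoader.readNumeric("expression_matrix.tab", "expression", "\t");
    System.out.println(data.numAttributes() + " attributes, " + data.numInstances() + " instances");
  }
}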