/** * Sorts the in-memory buffer * * @param write whether to write the sorted buffer to a temp file * @throws Exception if a problem occurs */ protected void sortBuffer(boolean write) throws Exception { String msg = statusMessagePrefix() + "Sorting in memory buffer...."; if (m_log != null) { m_log.statusMessage(msg); m_log.logMessage("[" + getCustomName() + "] " + msg); } Collections.sort(m_incrementalBuffer, m_sortComparator); if (!write) { return; } String tmpDir = m_tempDirectory; File tempFile = File.createTempFile("Sorter", ".tmp"); if (tmpDir != null && tmpDir.length() > 0) { try { tmpDir = m_env.substitute(tmpDir); File tempDir = new File(tmpDir); if (tempDir.exists() && tempDir.canWrite()) { String filename = tempFile.getName(); File newFile = new File(tmpDir + File.separator + filename); tempFile = newFile; tempFile.deleteOnExit(); } } catch (Exception ex) { } } if (!m_stopRequested.get()) { m_bufferFiles.add(tempFile); FileOutputStream fos = new FileOutputStream(tempFile); // GZIPOutputStream gzo = new GZIPOutputStream(fos); BufferedOutputStream bos = new BufferedOutputStream(fos, 50000); ObjectOutputStream oos = new ObjectOutputStream(bos); msg = statusMessagePrefix() + "Writing buffer to temp file " + m_bufferFiles.size() + "..."; if (m_log != null) { m_log.statusMessage(msg); m_log.logMessage("[" + getCustomName() + "] " + msg); } for (int i = 0; i < m_incrementalBuffer.size(); i++) { InstanceHolder temp = m_incrementalBuffer.get(i); temp.m_instance.setDataset(null); oos.writeObject(temp); if (i % (m_bufferSizeI / 10) == 0) { oos.reset(); } } bos.flush(); oos.close(); } m_incrementalBuffer.clear(); }
/** * Stops the step (and upstream ones) and then prints an error message and optional exception * message * * @param error the error message to print * @param ex the optional exception */ protected void stopWithErrorMessage(String error, Exception ex) { stop(); if (m_log != null) { m_log.statusMessage(statusMessagePrefix() + error + " - see log for details"); m_log.logMessage(statusMessagePrefix() + error + (ex != null ? " " + ex.getMessage() : "")); } }
/** * Accept and process a data set event * * @param e a <code>DataSetEvent</code> value */ @Override public void acceptDataSet(DataSetEvent e) { m_busy = true; m_stopRequested.set(false); if (m_log != null && e.getDataSet().numInstances() > 0) { m_log.statusMessage(statusMessagePrefix() + "Sorting batch..."); } if (e.isStructureOnly()) { // nothing to sort! // just notify listeners of structure DataSetEvent d = new DataSetEvent(this, e.getDataSet()); notifyDataListeners(d); m_busy = false; return; } try { init(new Instances(e.getDataSet(), 0)); } catch (IllegalArgumentException ex) { if (m_log != null) { String message = "ERROR: There is a problem with the incoming instance structure"; // m_log.statusMessage(statusMessagePrefix() + message // + " - see log for details"); // m_log.logMessage(statusMessagePrefix() + message + " :" // + ex.getMessage()); stopWithErrorMessage(message, ex); m_busy = false; return; } } List<InstanceHolder> instances = new ArrayList<InstanceHolder>(); for (int i = 0; i < e.getDataSet().numInstances(); i++) { InstanceHolder h = new InstanceHolder(); h.m_instance = e.getDataSet().instance(i); instances.add(h); } Collections.sort(instances, m_sortComparator); Instances output = new Instances(e.getDataSet(), 0); for (int i = 0; i < instances.size(); i++) { output.add(instances.get(i).m_instance); } DataSetEvent d = new DataSetEvent(this, output); notifyDataListeners(d); if (m_log != null) { m_log.statusMessage(statusMessagePrefix() + "Finished."); } m_busy = false; }
/**
 * Accept and process a batch of instances, routing each instance to the
 * "true" or "false" downstream step according to the result of evaluating
 * the configured expression.
 *
 * @param e the <code>DataSetEvent</code> carrying the batch
 */
@Override
public void acceptDataSet(DataSetEvent e) {
  m_busy = true;

  if (m_log != null && !e.isStructureOnly()) {
    m_log.statusMessage(statusMessagePrefix() + "Processing batch...");
  }

  init(new Instances(e.getDataSet(), 0));

  if (m_root == null) {
    // no expression configured - pass the whole batch down the "true" side
    if (m_indexOfTrueStep >= 0) {
      DataSetEvent passThrough = new DataSetEvent(this, e.getDataSet());
      ((DataSourceListener) m_downstream[m_indexOfTrueStep])
        .acceptDataSet(passThrough);
    }
  } else {
    Instances matched = new Instances(e.getDataSet(), 0);
    Instances unmatched = new Instances(e.getDataSet(), 0);

    int numInsts = e.getDataSet().numInstances();
    for (int i = 0; i < numInsts; i++) {
      Instance current = e.getDataSet().instance(i);
      if (m_root.evaluate(current, true)) {
        // only collect instances for sides that have a listener attached
        if (m_indexOfTrueStep >= 0) {
          matched.add(current);
        }
      } else if (m_indexOfFalseStep >= 0) {
        unmatched.add(current);
      }
    }

    if (m_indexOfTrueStep >= 0) {
      ((DataSourceListener) m_downstream[m_indexOfTrueStep])
        .acceptDataSet(new DataSetEvent(this, matched));
    }
    if (m_indexOfFalseStep >= 0) {
      ((DataSourceListener) m_downstream[m_indexOfFalseStep])
        .acceptDataSet(new DataSetEvent(this, unmatched));
    }
  }

  if (m_log != null && !e.isStructureOnly()) {
    m_log.statusMessage(statusMessagePrefix() + "Finished");
  }

  m_busy = false;
}
/** * Convert an <code>Instance</code> to an array of values that matches the format of the mining * schema. First maps raw attribute values and then applies rules for missing values, outliers * etc. * * @param inst the <code>Instance</code> to convert * @param miningSchema the mining schema incoming instance attributes * @return an array of doubles that are values from the incoming Instances, correspond to the * format of the mining schema and have had missing values, outliers etc. dealt with. * @throws Exception if something goes wrong */ public double[] instanceToSchema(Instance inst, MiningSchema miningSchema) throws Exception { Instances miningSchemaI = miningSchema.getMiningSchemaAsInstances(); // allocate enough space for both mining schema fields and any derived fields double[] result = new double[miningSchema.getFieldsAsInstances().numAttributes()]; // Copy over the values for (int i = 0; i < miningSchemaI.numAttributes(); i++) { // if (miningSchemaI.attribute(i).isNumeric()) { result[i] = inst.value(m_fieldsMap[i]); if (miningSchemaI.attribute(i).isNominal() || miningSchemaI.attribute(i).isString()) { // If not missing, look up the index of this incoming categorical value in // the mining schema if (!Utils.isMissingValue(inst.value(m_fieldsMap[i]))) { int[] valueMap = m_nominalValueMaps[i]; int index = valueMap[(int) inst.value(m_fieldsMap[i])]; String incomingAttValue = inst.attribute(m_fieldsMap[i]).value((int) inst.value(m_fieldsMap[i])); /*int index = miningSchemaI.attribute(i).indexOfValue(incomingAttValue); */ if (index >= 0) { result[i] = index; } else { // set this to "unknown" (-1) for nominal valued attributes result[i] = UNKNOWN_NOMINAL_VALUE; String warningString = "[MappingInfo] WARNING: Can't match nominal value " + incomingAttValue; if (m_log != null) { m_log.logMessage(warningString); } else { System.err.println(warningString); } } } } } // Now deal with missing values and outliers... 
miningSchema.applyMissingAndOutlierTreatments(result); // printInst(result); // now fill in any derived values ArrayList<DerivedFieldMetaInfo> derivedFields = miningSchema.getDerivedFields(); for (int i = 0; i < derivedFields.size(); i++) { DerivedFieldMetaInfo temp = derivedFields.get(i); // System.err.println("Applying : " + temp); double r = temp.getDerivedValue(result); result[i + miningSchemaI.numAttributes()] = r; } /*System.err.print("==> "); for (int i = 0; i < result.length; i++) { System.err.print(" " + result[i]); } System.err.println();*/ return result; }
/**
 * Stop processing. Requests the upstream step (if any) to stop first, then
 * updates the status area and clears the busy flag.
 */
@Override
public void stop() {
  // instanceof is false for null, so this also covers the no-listenee case
  if (m_listenee instanceof BeanCommon) {
    ((BeanCommon) m_listenee).stop();
  }
  if (m_log != null) {
    m_log.statusMessage(statusMessagePrefix() + "Stopped");
  }
  m_busy = false;
}
/**
 * Accept and process a single streaming instance event, routing it to the
 * "true" or "false" downstream step according to the expression result.
 *
 * NOTE: a single InstanceEvent object (m_ie) is reused and mutated for every
 * downstream notification, so the order of setInstance()/setStatus() calls
 * relative to each acceptInstance() call below is significant.
 *
 * @param e the incoming <code>InstanceEvent</code>
 */
@Override
public void acceptInstance(InstanceEvent e) {
  m_busy = true;
  if (e.getStatus() == InstanceEvent.FORMAT_AVAILABLE) {
    Instances structure = e.getStructure();
    init(structure);
    if (m_log != null) {
      m_log.statusMessage(statusMessagePrefix() + "Processing stream...");
    }
    // notify listeners of structure
    m_ie.setStructure(structure);
    if (m_indexOfTrueStep >= 0) {
      ((InstanceListener) m_downstream[m_indexOfTrueStep]).acceptInstance(m_ie);
    }
    if (m_indexOfFalseStep >= 0) {
      ((InstanceListener) m_downstream[m_indexOfFalseStep]).acceptInstance(m_ie);
    }
  } else {
    Instance inst = e.getInstance();
    // propagate the incoming status (e.g. BATCH_FINISHED) downstream
    m_ie.setStatus(e.getStatus());
    if (inst == null || e.getStatus() == InstanceEvent.BATCH_FINISHED) {
      // end-of-stream handling: both sides must see the BATCH_FINISHED
      // event; the side that doesn't receive the final instance (if any)
      // gets a null instance instead
      if (inst != null) {
        // evaluate and notify
        boolean result = true;
        if (m_root != null) {
          result = m_root.evaluate(inst, true);
        }
        if (result) {
          // final instance goes to the "true" side; "false" side is
          // notified of the end of stream with a null instance
          if (m_indexOfTrueStep >= 0) {
            m_ie.setInstance(inst);
            ((InstanceListener) m_downstream[m_indexOfTrueStep]).acceptInstance(m_ie);
          }
          if (m_indexOfFalseStep >= 0) {
            m_ie.setInstance(null);
            ((InstanceListener) m_downstream[m_indexOfFalseStep]).acceptInstance(m_ie);
          }
        } else {
          // mirror image: final instance to the "false" side
          if (m_indexOfFalseStep >= 0) {
            m_ie.setInstance(inst);
            ((InstanceListener) m_downstream[m_indexOfFalseStep]).acceptInstance(m_ie);
          }
          if (m_indexOfTrueStep >= 0) {
            m_ie.setInstance(null);
            ((InstanceListener) m_downstream[m_indexOfTrueStep]).acceptInstance(m_ie);
          }
        }
      } else {
        // notify both of end of stream
        m_ie.setInstance(null);
        if (m_indexOfTrueStep >= 0) {
          ((InstanceListener) m_downstream[m_indexOfTrueStep]).acceptInstance(m_ie);
        }
        if (m_indexOfFalseStep >= 0) {
          ((InstanceListener) m_downstream[m_indexOfFalseStep]).acceptInstance(m_ie);
        }
      }
      if (m_log != null) {
        m_log.statusMessage(statusMessagePrefix() + "Finished");
      }
    } else {
      // mid-stream instance: evaluate the expression (no expression
      // configured means everything routes to the "true" side) and
      // forward to exactly one side
      boolean result = true;
      if (m_root != null) {
        result = m_root.evaluate(inst, true);
      }
      m_ie.setInstance(inst);
      if (result) {
        if (m_indexOfTrueStep >= 0) {
          ((InstanceListener) m_downstream[m_indexOfTrueStep]).acceptInstance(m_ie);
        }
      } else {
        if (m_indexOfFalseStep >= 0) {
          ((InstanceListener) m_downstream[m_indexOfFalseStep]).acceptInstance(m_ie);
        }
      }
    }
  }
  m_busy = false;
}
public MappingInfo(Instances dataSet, MiningSchema miningSchema, Logger log) throws Exception { m_log = log; // miningSchema.convertStringAttsToNominal(); Instances fieldsI = miningSchema.getMiningSchemaAsInstances(); m_fieldsMap = new int[fieldsI.numAttributes()]; m_nominalValueMaps = new int[fieldsI.numAttributes()][]; for (int i = 0; i < fieldsI.numAttributes(); i++) { String schemaAttName = fieldsI.attribute(i).name(); boolean found = false; for (int j = 0; j < dataSet.numAttributes(); j++) { if (dataSet.attribute(j).name().equals(schemaAttName)) { Attribute miningSchemaAtt = fieldsI.attribute(i); Attribute incomingAtt = dataSet.attribute(j); // check type match if (miningSchemaAtt.type() != incomingAtt.type()) { throw new Exception( "[MappingInfo] type mismatch for field " + schemaAttName + ". Mining schema type " + miningSchemaAtt.toString() + ". Incoming type " + incomingAtt.toString() + "."); } // check nominal values (number, names...) if (miningSchemaAtt.numValues() != incomingAtt.numValues()) { String warningString = "[MappingInfo] WARNING: incoming nominal attribute " + incomingAtt.name() + " does not have the same " + "number of values as the corresponding mining " + "schema attribute."; if (m_log != null) { m_log.logMessage(warningString); } else { System.err.println(warningString); } } if (miningSchemaAtt.isNominal() || miningSchemaAtt.isString()) { int[] valuesMap = new int[incomingAtt.numValues()]; for (int k = 0; k < incomingAtt.numValues(); k++) { String incomingNomVal = incomingAtt.value(k); int indexInSchema = miningSchemaAtt.indexOfValue(incomingNomVal); if (indexInSchema < 0) { String warningString = "[MappingInfo] WARNING: incoming nominal attribute " + incomingAtt.name() + " has value " + incomingNomVal + " that doesn't occur in the mining schema."; if (m_log != null) { m_log.logMessage(warningString); } else { System.err.println(warningString); } valuesMap[k] = UNKNOWN_NOMINAL_VALUE; } else { valuesMap[k] = indexInSchema; } } 
m_nominalValueMaps[i] = valuesMap; } /*if (miningSchemaAtt.isNominal()) { for (int k = 0; k < miningSchemaAtt.numValues(); k++) { if (!miningSchemaAtt.value(k).equals(incomingAtt.value(k))) { throw new Exception("[PMMLUtils] value " + k + " (" + miningSchemaAtt.value(k) + ") does not match " + "incoming value (" + incomingAtt.value(k) + ") for attribute " + miningSchemaAtt.name() + "."); } } }*/ found = true; m_fieldsMap[i] = j; } } if (!found) { throw new Exception( "[MappingInfo] Unable to find a match for mining schema " + "attribute " + schemaAttName + " in the " + "incoming instances!"); } } // check class attribute (if set) if (fieldsI.classIndex() >= 0) { if (dataSet.classIndex() < 0) { // first see if we can find a matching class String className = fieldsI.classAttribute().name(); Attribute classMatch = dataSet.attribute(className); if (classMatch == null) { throw new Exception( "[MappingInfo] Can't find match for target field " + className + "in incoming instances!"); } dataSet.setClass(classMatch); } else if (!fieldsI.classAttribute().name().equals(dataSet.classAttribute().name())) { throw new Exception( "[MappingInfo] class attribute in mining schema does not match " + "class attribute in incoming instances!"); } } // Set up the textual description of the mapping fieldsMappingString(fieldsI, dataSet); }
/**
 * Accept and process an instance event.
 *
 * On FORMAT_AVAILABLE the step is (re)initialized: the streaming buffer size
 * is resolved from the (possibly environment-variable) setting, and fresh
 * buffer/temp-file bookkeeping is created. Subsequent instances are
 * accumulated in the in-memory buffer; each time the buffer fills it is
 * sorted and spilled to a temp file via sortBuffer(true). At end of stream
 * (null instance or BATCH_FINISHED) the buffered/spilled instances are
 * emitted by emitBufferedInstances().
 *
 * @param e an <code>InstanceEvent</code> value
 */
@Override
public void acceptInstance(InstanceEvent e) {
  if (e.getStatus() == InstanceEvent.FORMAT_AVAILABLE) {
    m_connectedFormat = e.getStructure();
    m_stopRequested.set(false);
    try {
      init(new Instances(e.getStructure(), 0));
    } catch (IllegalArgumentException ex) {
      // NOTE(review): when m_log is null this error is silently swallowed
      // and processing continues with the bad structure - looks
      // unintentional; compare with the batch acceptDataSet handler
      if (m_log != null) {
        String message = "ERROR: There is a problem with the incoming instance structure";
        // m_log.statusMessage(statusMessagePrefix() + message
        // + " - see log for details");
        // m_log.logMessage(statusMessagePrefix() + message + " :"
        // + ex.getMessage());
        stopWithErrorMessage(message, ex);
        // m_busy = false;
        return;
      }
    }
    // resolve the buffer size setting, substituting any environment
    // variables; on failure the previous m_bufferSizeI value is retained
    String buffSize = m_bufferSize;
    try {
      buffSize = m_env.substitute(buffSize);
      m_bufferSizeI = Integer.parseInt(buffSize);
    } catch (Exception ex) {
      ex.printStackTrace();
    }
    // fresh per-stream state
    m_incrementalBuffer = new ArrayList<InstanceHolder>(m_bufferSizeI);
    m_bufferFiles = new ArrayList<File>();
    m_streamCounter = 0;
    return;
  }
  m_busy = true;
  if (e.getInstance() != null) {
    if (m_streamCounter == 0) {
      // first instance of the stream - report the effective buffer size
      if (m_log != null) {
        m_log.statusMessage(statusMessagePrefix() + "Starting streaming sort...");
        m_log.logMessage("[Sorter] " + statusMessagePrefix()
          + " Using streaming buffer size: " + m_bufferSizeI);
      }
    }
    InstanceHolder tempH = new InstanceHolder();
    tempH.m_instance = e.getInstance();
    tempH.m_fileNumber = -1; // unused here
    if (m_stringAttIndexes != null) {
      // string attribute values must be copied out because they are held
      // by reference in the (reused) incoming instance
      copyStringAttVals(tempH);
    }
    m_incrementalBuffer.add(tempH);
    m_streamCounter++;
  }
  if (e.getInstance() == null || e.getStatus() == InstanceEvent.BATCH_FINISHED) {
    // end of stream - flush everything downstream
    emitBufferedInstances();
    // thread will set busy to false and report done status when
    // complete
    return;
  } else if (m_incrementalBuffer.size() == m_bufferSizeI) {
    // time to sort and write this to a temp file
    try {
      sortBuffer(true);
    } catch (Exception ex) {
      String msg = statusMessagePrefix() + "ERROR: unable to write to temp file.";
      // if (m_log != null) {
      // m_log.statusMessage(msg);
      // m_log.logMessage("[" + getCustomName() + "] " + msg);
      // }
      stopWithErrorMessage(msg, ex);
      // ex.printStackTrace();
      m_busy = false;
      return;
    }
  }
  m_busy = false;
}