예제 #1
0
  /**
   * Sorts the in-memory buffer with the configured comparator and, optionally, writes the sorted
   * contents to a new temporary file that is registered in the list of buffer files.
   *
   * <p>The temporary file is created directly in the user-specified temp directory when that
   * directory (after environment variable substitution) exists and is writable; otherwise the
   * default temp directory is used. Nothing is written, and no file is created, if a stop has
   * been requested. The in-memory buffer is cleared on normal completion in either case.
   *
   * @param write whether to write the sorted buffer to a temp file
   * @throws Exception if a problem occurs
   */
  protected void sortBuffer(boolean write) throws Exception {

    String msg = statusMessagePrefix() + "Sorting in memory buffer....";
    if (m_log != null) {
      m_log.statusMessage(msg);
      m_log.logMessage("[" + getCustomName() + "] " + msg);
    }

    Collections.sort(m_incrementalBuffer, m_sortComparator);

    if (!write) {
      return;
    }

    if (!m_stopRequested.get()) {

      // Create the file directly in the requested directory so that no stray, empty
      // file is left behind in the default temp directory.
      File tempFile = null;
      String tmpDir = m_tempDirectory;
      if (tmpDir != null && tmpDir.length() > 0) {
        try {
          File tempDir = new File(m_env.substitute(tmpDir));
          if (tempDir.isDirectory() && tempDir.canWrite()) {
            tempFile = File.createTempFile("Sorter", ".tmp", tempDir);
          }
        } catch (Exception ex) {
          // Best effort only: fall back to the default temp directory below.
          if (m_log != null) {
            m_log.logMessage(
                "["
                    + getCustomName()
                    + "] "
                    + statusMessagePrefix()
                    + "Unable to use temp directory '"
                    + tmpDir
                    + "' ("
                    + ex.getMessage()
                    + ") - using the default temp directory instead.");
          }
        }
      }
      if (tempFile == null) {
        tempFile = File.createTempFile("Sorter", ".tmp");
      }
      tempFile.deleteOnExit();

      m_bufferFiles.add(tempFile);

      // Buffer sizes below 10 would make the interval zero and the modulus below
      // throw an ArithmeticException, so never let it drop below one.
      int resetInterval = Math.max(1, m_bufferSizeI / 10);

      FileOutputStream fos = new FileOutputStream(tempFile);
      ObjectOutputStream oos = null;
      try {
        oos = new ObjectOutputStream(new BufferedOutputStream(fos, 50000));

        msg = statusMessagePrefix() + "Writing buffer to temp file " + m_bufferFiles.size() + "...";
        if (m_log != null) {
          m_log.statusMessage(msg);
          m_log.logMessage("[" + getCustomName() + "] " + msg);
        }

        for (int i = 0; i < m_incrementalBuffer.size(); i++) {
          InstanceHolder temp = m_incrementalBuffer.get(i);
          // detach the instance from its dataset before serializing the holder
          temp.m_instance.setDataset(null);
          oos.writeObject(temp);
          if (i % resetInterval == 0) {
            // periodically drop the stream's back-reference table
            oos.reset();
          }
        }

        oos.flush();
      } finally {
        // Always release the file handle, even if serialization failed part way through.
        if (oos != null) {
          oos.close();
        } else {
          fos.close();
        }
      }
    }
    m_incrementalBuffer.clear();
  }
예제 #2
0
 /**
  * Halts this step via {@code stop()} and, when a log is available, reports the failure: a short
  * status line pointing at the log, plus a log entry carrying the exception message (if any).
  *
  * @param error the error message to print
  * @param ex the optional exception; may be null
  */
 protected void stopWithErrorMessage(String error, Exception ex) {
   stop();

   // nothing to report to when no log has been supplied
   if (m_log == null) {
     return;
   }

   String prefixedError = statusMessagePrefix() + error;
   m_log.statusMessage(prefixedError + " - see log for details");

   String detail = "";
   if (ex != null) {
     detail = " " + ex.getMessage();
   }
   m_log.logMessage(prefixedError + detail);
 }
예제 #3
0
  /**
   * Translate an incoming <code>Instance</code> into a value array laid out according to the
   * mining schema. Raw attribute values are copied across first (categorical values are re-indexed
   * through the precomputed value maps), then the schema's missing value/outlier treatments are
   * applied and finally any derived fields are computed and appended.
   *
   * @param inst the <code>Instance</code> to convert
   * @param miningSchema the mining schema describing the expected fields
   * @return an array of doubles holding the mining schema fields followed by the derived fields,
   *     with missing values, outliers etc. dealt with.
   * @throws Exception if something goes wrong
   */
  public double[] instanceToSchema(Instance inst, MiningSchema miningSchema) throws Exception {
    Instances schemaHeader = miningSchema.getMiningSchemaAsInstances();
    int numSchemaFields = schemaHeader.numAttributes();

    // sized for the mining schema fields plus any derived fields
    double[] result = new double[miningSchema.getFieldsAsInstances().numAttributes()];

    for (int field = 0; field < numSchemaFields; field++) {
      int sourceIndex = m_fieldsMap[field];
      double rawValue = inst.value(sourceIndex);
      result[field] = rawValue;

      boolean categorical =
          schemaHeader.attribute(field).isNominal() || schemaHeader.attribute(field).isString();
      if (!categorical || Utils.isMissingValue(rawValue)) {
        // numeric and missing values are passed through unchanged
        continue;
      }

      // re-index the incoming categorical value into the mining schema's value list
      int incomingIndex = (int) rawValue;
      int schemaIndex = m_nominalValueMaps[field][incomingIndex];
      String incomingLabel = inst.attribute(sourceIndex).value(incomingIndex);
      if (schemaIndex >= 0) {
        result[field] = schemaIndex;
        continue;
      }

      // no counterpart in the mining schema: flag the value as unknown and warn
      result[field] = UNKNOWN_NOMINAL_VALUE;
      String warning = "[MappingInfo] WARNING: Can't match nominal value " + incomingLabel;
      if (m_log == null) {
        System.err.println(warning);
      } else {
        m_log.logMessage(warning);
      }
    }

    // apply missing value and outlier handling to the mapped values
    miningSchema.applyMissingAndOutlierTreatments(result);

    // derived fields are stored directly after the mining schema fields
    ArrayList<DerivedFieldMetaInfo> derivedFields = miningSchema.getDerivedFields();
    int slot = numSchemaFields;
    for (DerivedFieldMetaInfo derived : derivedFields) {
      double derivedValue = derived.getDerivedValue(result);
      result[slot] = derivedValue;
      slot++;
    }

    return result;
  }
예제 #4
0
  /**
   * Builds the mapping between the attributes of an incoming data set and the fields of a mining
   * schema.
   *
   * <p>For every mining schema field the incoming attribute with the same name is located and its
   * index recorded. Attribute types must agree. For nominal and string fields a per-value lookup
   * table is built that maps each incoming value index to the index of the same label in the
   * mining schema, or to {@code UNKNOWN_NOMINAL_VALUE} when the label does not occur there.
   * Differences in the number or names of values only produce warnings.
   *
   * <p>If the mining schema has a class attribute and the incoming data set has none, the class of
   * {@code dataSet} is set to the attribute with the matching name (this modifies the supplied
   * data set).
   *
   * @param dataSet the incoming instances (header information is sufficient for the mapping)
   * @param miningSchema the mining schema to map to
   * @param log the log to send warnings to; may be null, in which case warnings go to
   *     {@code System.err}
   * @throws Exception if a mining schema field has no matching incoming attribute, if attribute
   *     types differ, or if the class attributes cannot be reconciled
   */
  public MappingInfo(Instances dataSet, MiningSchema miningSchema, Logger log) throws Exception {
    m_log = log;
    Instances fieldsI = miningSchema.getMiningSchemaAsInstances();

    m_fieldsMap = new int[fieldsI.numAttributes()];
    m_nominalValueMaps = new int[fieldsI.numAttributes()][];

    for (int i = 0; i < fieldsI.numAttributes(); i++) {
      String schemaAttName = fieldsI.attribute(i).name();
      boolean found = false;
      for (int j = 0; j < dataSet.numAttributes(); j++) {
        if (dataSet.attribute(j).name().equals(schemaAttName)) {
          Attribute miningSchemaAtt = fieldsI.attribute(i);
          Attribute incomingAtt = dataSet.attribute(j);
          // check type match
          if (miningSchemaAtt.type() != incomingAtt.type()) {
            throw new Exception(
                "[MappingInfo] type mismatch for field "
                    + schemaAttName
                    + ". Mining schema type "
                    + miningSchemaAtt.toString()
                    + ". Incoming type "
                    + incomingAtt.toString()
                    + ".");
          }

          // check nominal values (number, names...)
          if (miningSchemaAtt.numValues() != incomingAtt.numValues()) {
            logMappingWarning(
                "[MappingInfo] WARNING: incoming nominal attribute "
                    + incomingAtt.name()
                    + " does not have the same "
                    + "number of values as the corresponding mining "
                    + "schema attribute.");
          }
          if (miningSchemaAtt.isNominal() || miningSchemaAtt.isString()) {
            // incoming value index -> mining schema value index
            int[] valuesMap = new int[incomingAtt.numValues()];
            for (int k = 0; k < incomingAtt.numValues(); k++) {
              String incomingNomVal = incomingAtt.value(k);
              int indexInSchema = miningSchemaAtt.indexOfValue(incomingNomVal);
              if (indexInSchema < 0) {
                logMappingWarning(
                    "[MappingInfo] WARNING: incoming nominal attribute "
                        + incomingAtt.name()
                        + " has value "
                        + incomingNomVal
                        + " that doesn't occur in the mining schema.");
                valuesMap[k] = UNKNOWN_NOMINAL_VALUE;
              } else {
                valuesMap[k] = indexInSchema;
              }
            }
            m_nominalValueMaps[i] = valuesMap;
          }

          found = true;
          m_fieldsMap[i] = j;
        }
      }
      if (!found) {
        throw new Exception(
            "[MappingInfo] Unable to find a match for mining schema "
                + "attribute "
                + schemaAttName
                + " in the "
                + "incoming instances!");
      }
    }

    // check class attribute (if set)
    if (fieldsI.classIndex() >= 0) {
      if (dataSet.classIndex() < 0) {
        // first see if we can find a matching class
        String className = fieldsI.classAttribute().name();
        Attribute classMatch = dataSet.attribute(className);
        if (classMatch == null) {
          throw new Exception(
              "[MappingInfo] Can't find match for target field "
                  + className
                  + " in incoming instances!");
        }
        dataSet.setClass(classMatch);
      } else if (!fieldsI.classAttribute().name().equals(dataSet.classAttribute().name())) {
        throw new Exception(
            "[MappingInfo] class attribute in mining schema does not match "
                + "class attribute in incoming instances!");
      }
    }

    // Set up the textual description of the mapping
    fieldsMappingString(fieldsI, dataSet);
  }

  /** Sends a warning to the log if one is available, otherwise to {@code System.err}. */
  private void logMappingWarning(String warningString) {
    if (m_log != null) {
      m_log.logMessage(warningString);
    } else {
      System.err.println(warningString);
    }
  }
예제 #5
0
  /**
   * Accept and process an instance event.
   *
   * <p>A {@code FORMAT_AVAILABLE} event (re)initializes the step: the incoming structure is
   * recorded, the stop flag is cleared, the buffer size is resolved and fresh buffers are
   * allocated. If the structure is rejected the step is stopped with an error message.
   *
   * <p>Any other event adds its instance (if present) to the in-memory buffer. When the buffer
   * reaches the configured size it is sorted and written to a temp file; when the stream ends
   * (null instance or {@code BATCH_FINISHED}) the buffered instances are emitted.
   *
   * @param e an <code>InstanceEvent</code> value
   */
  @Override
  public void acceptInstance(InstanceEvent e) {

    if (e.getStatus() == InstanceEvent.FORMAT_AVAILABLE) {
      m_connectedFormat = e.getStructure();
      m_stopRequested.set(false);
      try {
        init(new Instances(e.getStructure(), 0));
      } catch (IllegalArgumentException ex) {
        // stopWithErrorMessage() copes with a missing log itself, so always stop here
        // rather than silently carrying on with an unusable structure.
        stopWithErrorMessage("ERROR: There is a problem with the incoming instance structure", ex);
        return;
      }

      String buffSize = m_bufferSize;
      try {
        buffSize = m_env.substitute(buffSize);
        m_bufferSizeI = Integer.parseInt(buffSize);
      } catch (Exception ex) {
        // keep whatever buffer size is currently set, but make the problem visible
        if (m_log != null) {
          m_log.logMessage(
              "[Sorter] "
                  + statusMessagePrefix()
                  + "WARNING: unable to determine buffer size from '"
                  + buffSize
                  + "' ("
                  + ex.getMessage()
                  + ") - using "
                  + m_bufferSizeI);
        } else {
          ex.printStackTrace();
        }
      }
      m_incrementalBuffer = new ArrayList<InstanceHolder>(m_bufferSizeI);
      m_bufferFiles = new ArrayList<File>();
      m_streamCounter = 0;

      return;
    }

    m_busy = true;

    if (e.getInstance() != null) {
      if (m_streamCounter == 0) {
        if (m_log != null) {
          m_log.statusMessage(statusMessagePrefix() + "Starting streaming sort...");
          m_log.logMessage(
              "[Sorter] "
                  + statusMessagePrefix()
                  + " Using streaming buffer size: "
                  + m_bufferSizeI);
        }
      }

      InstanceHolder tempH = new InstanceHolder();
      tempH.m_instance = e.getInstance();
      tempH.m_fileNumber = -1; // unused here
      if (m_stringAttIndexes != null) {
        copyStringAttVals(tempH);
      }
      m_incrementalBuffer.add(tempH);
      m_streamCounter++;
    }

    if (e.getInstance() == null || e.getStatus() == InstanceEvent.BATCH_FINISHED) {
      emitBufferedInstances();
      // thread will set busy to false and report done status when
      // complete
      return;
    } else if (m_incrementalBuffer.size() == m_bufferSizeI) {
      // time to sort and write this to a temp file
      try {
        sortBuffer(true);
      } catch (Exception ex) {
        // stopWithErrorMessage() adds the status message prefix itself, so pass the
        // bare message to avoid the prefix appearing twice.
        stopWithErrorMessage("ERROR: unable to write to temp file.", ex);

        m_busy = false;
        return;
      }
    }

    m_busy = false;
  }