예제 #1
0
 /**
  * Re-homes the string values referenced by an instance from a source dataset to a destination
  * dataset, rewriting the instance's references so that they are valid for the destination.
  *
  * <p>The instance may be laid out like either dataset; {@code instSrcCompat} selects which index
  * array is used to address the instance's own values. The two index arrays are paired up
  * position by position, so they must have the same length (the i-th source string attribute is
  * taken to correspond to the i-th destination string attribute).
  *
  * <p>Nothing is done when both datasets are the very same object. Missing values are left
  * untouched.
  *
  * @param instance the instance whose string references are rewritten in place.
  * @param instSrcCompat true if the instance has the attribute layout of the source dataset,
  *     false if it has the layout of the destination dataset.
  * @param srcDataset the dataset for which the instance's current string references are valid.
  * @param srcStrAtts the indices of the string attributes in the source dataset.
  * @param destDataset the dataset that receives the string values.
  * @param destStrAtts the indices of the string attributes in the destination dataset.
  * @throws IllegalArgumentException if the two index arrays differ in length.
  */
 protected void copyStringValues(
     M5Instance instance,
     boolean instSrcCompat,
     M5Instances srcDataset,
     int[] srcStrAtts,
     M5Instances destDataset,
     int[] destStrAtts) {
   if (srcDataset == destDataset) {
     // Same dataset object: the references are already valid.
     return;
   }
   final int pairCount = srcStrAtts.length;
   if (pairCount != destStrAtts.length) {
     throw new IllegalArgumentException("Src and Dest string indices differ in length!!");
   }
   // The instance's own layout decides which index array addresses its values.
   final int[] instPositions = instSrcCompat ? srcStrAtts : destStrAtts;
   for (int pos = 0; pos < pairCount; pos++) {
     M5Attribute from = srcDataset.attribute(srcStrAtts[pos]);
     M5Attribute to = destDataset.attribute(destStrAtts[pos]);
     int slot = instPositions[pos];
     if (instance.isMissing(slot)) {
       continue;
     }
     // Register the referenced value with the destination attribute and store the index it
     // hands back. One setValue call per string attribute is acceptable unless the dataset has
     // a very large number of string attributes.
     int oldRef = (int) instance.value(slot);
     int newRef = to.addStringValue(from, oldRef);
     instance.setValue(slot, (double) newRef);
   }
 }
예제 #2
0
  /**
   * Discards every instance currently buffered in the input-format dataset. Call this instead of
   * deleting from the input format directly.
   *
   * <p>When the input has string attributes the input format is replaced by the result of {@code
   * stringFreeStructure()}; otherwise the existing dataset is emptied in place.
   */
  protected void flushInput() {

    if (m_InputStringAtts.length == 0) {
      // No string attributes: empty the existing dataset in place (noted by the original author
      // as cheaper than building a new empty dataset).
      m_InputFormat.delete();
      return;
    }
    m_InputFormat = m_InputFormat.stringFreeStructure();
  }
예제 #3
0
  /**
   * Sets the format of the input instances.
   *
   * <p>The generic input-format bookkeeping is performed first, then the presence of a class
   * attribute is verified, the output format is (re)computed and the cached index table is
   * cleared.
   *
   * @param instanceInfo an Instances object containing the input instance structure (any
   *     instances contained in the object are ignored - only the structure is required).
   * @return true if the class attribute is nominal, i.e. the output format may be collected
   *     immediately; false otherwise.
   * @exception Exception if no class attribute has been assigned or the input format can't be set
   *     successfully
   */
  public boolean setInputFormat(M5Instances instanceInfo) throws Exception {

    superSetInputFormat(instanceInfo);

    boolean classMissing = instanceInfo.classIndex() < 0;
    if (classMissing) {
      throw new Exception("No class has been assigned to the instances");
    }

    setOutputFormat();
    m_Indices = null;

    return instanceInfo.classAttribute().isNominal();
  }
예제 #4
0
  /**
   * Collects the positions of all string attributes of a dataset.
   *
   * @param insts the Instances to scan for string attributes.
   * @return the indices, in ascending order, of the attributes whose type is {@code
   *     M5Attribute.STRING}; a zero-length array if there are none.
   */
  protected int[] getStringIndices(M5Instances insts) {

    final int total = insts.numAttributes();

    // Worst case every attribute is a string attribute, so size the scratch buffer for that.
    int[] scratch = new int[total];
    int found = 0;
    int pos = 0;
    while (pos < total) {
      boolean isString = insts.attribute(pos).type() == M5Attribute.STRING;
      if (isString) {
        scratch[found] = pos;
        found++;
      }
      pos++;
    }

    // Trim the buffer down to the entries actually filled in.
    int[] trimmed = new int[found];
    System.arraycopy(scratch, 0, trimmed, 0, found);
    return trimmed;
  }
예제 #5
0
  /**
   * Builds and installs the output format for the numeric-class case.
   *
   * <p>If the index table {@code m_Indices} has not been computed yet, the output format is set
   * to null. Otherwise every non-nominal attribute and the class attribute are copied unchanged,
   * while each remaining nominal attribute with {@code v} values is replaced by {@code v - 1} new
   * attributes. The k-th replacement is named {@code name=value,value,...}, listing the values at
   * positions k..v-1 of that attribute's row in {@code m_Indices}. Replacement attributes are
   * created from the name alone when {@code m_Numeric} is set, and as two-valued ("f"/"t")
   * attributes otherwise. The class index is shifted to account for attributes expanded in front
   * of it.
   */
  private void setOutputFormatNumeric() {

    if (m_Indices == null) {
      setOutputFormat(null);
      return;
    }

    final M5Instances input = getInputFormat();
    final int classIdx = input.classIndex();
    final int attCount = input.numAttributes();

    M5Vector expanded = new M5Vector();
    int shiftedClassIdx = classIdx;

    for (int a = 0; a < attCount; a++) {
      M5Attribute current = input.attribute(a);

      boolean passThrough = !current.isNominal() || a == classIdx;
      if (passThrough) {
        expanded.addElement(current.copy());
        continue;
      }

      final int valueCount = current.numValues();
      if (a < classIdx) {
        // One attribute becomes (valueCount - 1) attributes: a net gain of (valueCount - 2).
        shiftedClassIdx += valueCount - 2;
      }

      // One new attribute per split point in the value ordering given by m_Indices[a].
      for (int first = 1; first < valueCount; first++) {
        StringBuilder label = new StringBuilder(current.name() + "=");
        for (int v = first; v < valueCount; v++) {
          if (v > first) {
            label.append(',');
          }
          label.append(current.value(m_Indices[a][v]));
        }

        M5Attribute replacement;
        if (m_Numeric) {
          replacement = new M5Attribute(label.toString());
        } else {
          M5Vector flags = new M5Vector(2);
          flags.addElement("f");
          flags.addElement("t");
          replacement = new M5Attribute(label.toString(), flags);
        }
        expanded.addElement(replacement);
      }
    }

    M5Instances result = new M5Instances(input.relationName(), expanded, 0);
    result.setClassIndex(shiftedClassIdx);
    setOutputFormat(result);
  }
예제 #6
0
  /**
   * Buffers an input instance in the input-format dataset for later processing. Use this method
   * rather than adding to the input format directly, because the instance's string values are
   * first copied over to the input-format dataset and the instance is re-assigned to it.
   *
   * <p>A null instance is ignored.
   *
   * @param instance the <code>Instance</code> to buffer.
   */
  protected void bufferInput(M5Instance instance) {

    if (instance == null) {
      return;
    }
    copyStringValues(instance, m_InputFormat, m_InputStringAtts);
    instance.setDataset(m_InputFormat);
    m_InputFormat.add(instance);
  }
예제 #7
0
  /**
   * Performs the generic part of setting the input format: stores a string-free copy of the
   * input structure, records where its string attributes are, discards any previous output
   * format and output queue, and marks the start of a new batch.
   *
   * @param instanceInfo the dataset describing the input instance structure.
   * @return always false
   * @exception Exception declared for callers; nothing in this method throws it explicitly
   */
  public boolean superSetInputFormat(M5Instances instanceInfo) throws Exception {

    // Input side: structure without string values, plus the string attribute positions.
    this.m_InputFormat = instanceInfo.stringFreeStructure();
    this.m_InputStringAtts = getStringIndices(instanceInfo);

    // Output side: forget whatever was derived from the previous input format.
    this.m_OutputFormat = null;
    this.m_OutputQueue = new Queue();

    this.m_NewBatch = true;
    return false;
  }
예제 #8
0
  /**
   * Sets the format of output instances. The derived class should use this method once it has
   * determined the output format. The output queue is always replaced by a fresh, empty one.
   *
   * <p>For a non-null format, a string-free copy of it is stored, the positions of its string
   * attributes are recorded, and its relation name is extended with "-", the class name of this
   * filter and (for a NominalToBinaryFilter) the trimmed option strings.
   *
   * @param outputFormat the new output format, or null to clear the current one
   */
  protected void setOutputFormat(M5Instances outputFormat) {

    if (outputFormat == null) {
      m_OutputFormat = null;
      m_OutputQueue = new Queue();
      return;
    }

    m_OutputFormat = outputFormat.stringFreeStructure();
    m_OutputStringAtts = getStringIndices(m_OutputFormat);

    // Tag the relation (not an attribute) with the filter that produced it.
    StringBuilder taggedName = new StringBuilder();
    taggedName.append(outputFormat.relationName());
    taggedName.append("-");
    taggedName.append(this.getClass().getName());
    if (this instanceof NominalToBinaryFilter) {
      for (String option : ((NominalToBinaryFilter) this).getOptions()) {
        taggedName.append(option.trim());
      }
    }
    m_OutputFormat.setRelationName(taggedName.toString());

    m_OutputQueue = new Queue();
  }
예제 #9
0
  /**
   * Copies the string values referenced by an instance into another dataset. The instance must
   * already belong to a dataset, and that dataset must have the same number of attributes as the
   * destination. The same index array is used for both sides, i.e. the string attributes are
   * expected at identical positions.
   *
   * <p>Returns immediately, without any checks, when there are no string attributes.
   *
   * @param inst the Instance containing the string values to copy.
   * @param destDataset the destination set of Instances
   * @param strAtts an array containing the indices of any string attributes in the dataset.
   * @throws IllegalArgumentException if the instance has no dataset, or if its dataset and the
   *     destination differ in their number of attributes.
   */
  private void copyStringValues(M5Instance inst, M5Instances destDataset, int[] strAtts) {

    if (strAtts.length == 0) {
      return;
    }

    M5Instances owner = inst.dataset();
    if (owner == null) {
      throw new IllegalArgumentException("Instance has no dataset assigned!!");
    }
    if (owner.numAttributes() != destDataset.numAttributes()) {
      throw new IllegalArgumentException("Src and Dest differ in # of attributes!!");
    }

    // The instance is laid out like its own (source) dataset, hence instSrcCompat = true.
    copyStringValues(inst, true, owner, strAtts, destDataset, strAtts);
  }
예제 #10
0
  /**
   * Runs a whole dataset through a filter as a single batch and gathers the result.
   *
   * <p>Every instance of {@code data} is fed to the filter in order, the batch is closed, and
   * all instances the filter then hands out are appended to the dataset returned by the filter's
   * {@code getOutputFormat()}.
   *
   * @param data the data to be filtered
   * @param filter the filter to be used
   * @return the filtered set of data
   * @exception Exception if the filter can't be used successfully
   */
  public static M5Instances useFilter(M5Instances data, NominalToBinaryFilter filter)
      throws Exception {

    int row = 0;
    while (row < data.numInstances()) {
      filter.input(data.instance(row));
      row++;
    }
    filter.batchFinished();

    // Drain the filter until it reports that nothing is left (null).
    M5Instances filtered = filter.getOutputFormat();
    for (M5Instance next = filter.output(); next != null; next = filter.output()) {
      filtered.add(next);
    }
    return filtered;
  }
예제 #11
0
  /**
   * Removes the next filtered instance from the output queue and returns it.
   *
   * <p>When this call empties the queue at a batch boundary and the output has string
   * attributes, the output format is replaced by its string-free structure so that references to
   * old strings are dropped.
   *
   * @return the next instance from the output queue, or null if the queue is empty.
   * @exception NullPointerException if no output structure has been defined
   */
  public M5Instance output() {

    if (m_OutputFormat == null) {
      throw new NullPointerException("No output instance format defined");
    }
    if (m_OutputQueue.empty()) {
      return null;
    }

    M5Instance next = (M5Instance) m_OutputQueue.pop();

    // Clear out references to old strings occasionally
    if (m_OutputQueue.empty() && m_NewBatch && m_OutputStringAtts.length > 0) {
      m_OutputFormat = m_OutputFormat.stringFreeStructure();
    }
    return next;
  }
예제 #12
0
  /**
   * Method for testing a filter's ability to process multiple batches: the first input file is
   * used to set up the filter and is filtered as the first batch, then the second input file is
   * filtered as a second batch with the already configured filter.
   *
   * <p>Both input readers are closed before returning, also when an error occurs. Output
   * writers that target files are closed; writers that target standard output are only flushed,
   * so that standard output stays usable for the second batch and for the caller.
   *
   * @param filter the filter to run; its own options are parsed from {@code options} as well
   * @param options should contain the following arguments:<br>
   *     -i (first) input file <br>
   *     -o (first) output file (standard output if omitted) <br>
   *     -r (second) input file <br>
   *     -s (second) output file (standard output if omitted) <br>
   *     -c class_index ("first", "last" or a 1-based attribute number) <br>
   *     or -h for help on options
   * @exception Exception if something goes wrong or the user requests help on command options;
   *     for failures during option handling and header loading the message lists the valid
   *     options and the original exception is attached as the cause
   */
  public static void batchFilterFile(NominalToBinaryFilter filter, String[] options)
      throws Exception {

    M5Instances firstData = null;
    M5Instances secondData = null;
    Reader firstInput = null;
    Reader secondInput = null;
    PrintWriter firstOutput = null;
    PrintWriter secondOutput = null;
    // Writers wrapping System.out must never be closed: closing them closes System.out itself.
    boolean firstToStdout = false;
    boolean secondToStdout = false;

    try {
      try {
        boolean helpRequest = M5StaticUtils.getFlag('h', options);

        String fileName = M5StaticUtils.getOption('i', options);
        if (fileName.length() != 0) {
          firstInput = new BufferedReader(new FileReader(fileName));
        } else {
          throw new Exception("No first input file given.\n");
        }

        fileName = M5StaticUtils.getOption('r', options);
        if (fileName.length() != 0) {
          secondInput = new BufferedReader(new FileReader(fileName));
        } else {
          throw new Exception("No second input file given.\n");
        }

        fileName = M5StaticUtils.getOption('o', options);
        if (fileName.length() != 0) {
          firstOutput = new PrintWriter(new FileOutputStream(fileName));
        } else {
          firstOutput = new PrintWriter(System.out);
          firstToStdout = true;
        }

        fileName = M5StaticUtils.getOption('s', options);
        if (fileName.length() != 0) {
          secondOutput = new PrintWriter(new FileOutputStream(fileName));
        } else {
          secondOutput = new PrintWriter(System.out);
          secondToStdout = true;
        }
        String classIndex = M5StaticUtils.getOption('c', options);

        if (filter instanceof NominalToBinaryFilter) {
          ((NominalToBinaryFilter) filter).setOptions(options);
        }
        M5StaticUtils.checkForRemainingOptions(options);

        if (helpRequest) {
          throw new Exception("Help requested.\n");
        }
        firstData = new M5Instances(firstInput, 1);
        secondData = new M5Instances(secondInput, 1);
        if (!secondData.equalHeaders(firstData)) {
          throw new Exception("Input file formats differ.\n");
        }
        if (classIndex.length() != 0) {
          if (classIndex.equals("first")) {
            firstData.setClassIndex(0);
            secondData.setClassIndex(0);
          } else if (classIndex.equals("last")) {
            firstData.setClassIndex(firstData.numAttributes() - 1);
            secondData.setClassIndex(secondData.numAttributes() - 1);
          } else {
            // The option is 1-based, class indices are 0-based.
            int zeroBased = Integer.parseInt(classIndex) - 1;
            firstData.setClassIndex(zeroBased);
            secondData.setClassIndex(zeroBased);
          }
        }
      } catch (Exception ex) {
        String filterOptions = "";
        // Output the error and also the valid options
        if (filter instanceof NominalToBinaryFilter) {
          filterOptions += "\nFilter options:\n\n";
          Enumeration enume = ((NominalToBinaryFilter) filter).listOptions();
          while (enume.hasMoreElements()) {
            Information option = (Information) enume.nextElement();
            filterOptions += option.synopsis() + '\n' + option.description() + "\n";
          }
        }

        String genericOptions =
            "\nGeneral options:\n\n"
                + "-h\n"
                + "\tGet help on available options.\n"
                + "-i <filename>\n"
                + "\tThe file containing first input instances.\n"
                + "-o <filename>\n"
                + "\tThe file first output instances will be written to.\n"
                + "-r <filename>\n"
                + "\tThe file containing second input instances.\n"
                + "-s <filename>\n"
                + "\tThe file second output instances will be written to.\n"
                + "-c <class index>\n"
                + "\tThe number of the attribute to use as the class.\n"
                + "\t\"first\" and \"last\" are also valid entries.\n"
                + "\tIf not supplied then no class is assigned.\n";

        // Keep the original exception as the cause so its stack trace is not lost.
        throw new Exception('\n' + ex.getMessage() + filterOptions + genericOptions, ex);
      }

      boolean printedHeader = false;
      if (filter.setInputFormat(firstData)) {
        firstOutput.println(filter.getOutputFormat().toString());
        printedHeader = true;
      }

      // Pass all the instances to the filter
      while (firstData.readInstance(firstInput)) {
        if (filter.input(firstData.instance(0))) {
          if (!printedHeader) {
            throw new Error("Filter didn't return true from setInputFormat() " + "earlier!");
          }
          firstOutput.println(filter.output().toString());
        }
        firstData.delete(0);
      }

      // Say that input has finished, and print any pending output instances
      if (filter.batchFinished()) {
        if (!printedHeader) {
          firstOutput.println(filter.getOutputFormat().toString());
        }
        while (filter.numPendingOutput() > 0) {
          firstOutput.println(filter.output().toString());
        }
      }

      // Finish the first output before the second batch writes anything, so that the order is
      // preserved when both go to standard output.
      if (firstToStdout) {
        firstOutput.flush();
      } else {
        firstOutput.close();
      }

      printedHeader = false;
      if (filter.isOutputFormatDefined()) {
        secondOutput.println(filter.getOutputFormat().toString());
        printedHeader = true;
      }
      // Pass all the second instances to the filter
      while (secondData.readInstance(secondInput)) {
        if (filter.input(secondData.instance(0))) {
          if (!printedHeader) {
            throw new Error("Filter didn't return true from" + " isOutputFormatDefined() earlier!");
          }
          secondOutput.println(filter.output().toString());
        }
        secondData.delete(0);
      }

      // Say that input has finished, and print any pending output instances
      if (filter.batchFinished()) {
        if (!printedHeader) {
          secondOutput.println(filter.getOutputFormat().toString());
        }
        while (filter.numPendingOutput() > 0) {
          secondOutput.println(filter.output().toString());
        }
      }
    } finally {
      // Release everything on every path. Closing a PrintWriter twice is harmless.
      if (firstOutput != null) {
        if (firstToStdout) {
          firstOutput.flush();
        } else {
          firstOutput.close();
        }
      }
      if (secondOutput != null) {
        if (secondToStdout) {
          secondOutput.flush();
        } else {
          secondOutput.close();
        }
      }
      if (firstInput != null) {
        try {
          firstInput.close();
        } catch (java.io.IOException ignored) {
          // Best effort: a failure to close must not mask the primary outcome.
        }
      }
      if (secondInput != null) {
        try {
          secondInput.close();
        } catch (java.io.IOException ignored) {
          // Best effort: a failure to close must not mask the primary outcome.
        }
      }
    }
  }
예제 #13
0
  /**
   * Method for testing filters: reads instances from a file (or standard input), passes them
   * through the filter as one batch and writes the result to a file (or standard output).
   *
   * <p>An input file opened by this method is closed before returning, also when an error
   * occurs; standard input is left open. An output file is closed, whereas standard output is
   * only flushed so that it stays usable for the caller.
   *
   * @param filter the filter to run; its own options are parsed from {@code options} as well
   * @param options should contain the following arguments: <br>
   *     -i input_file (standard input if omitted) <br>
   *     -o output_file (standard output if omitted) <br>
   *     -c class_index ("first", "last" or a 1-based attribute number) <br>
   *     -d to print progress messages to standard error <br>
   *     or -h for help on options
   * @exception Exception if something goes wrong or the user requests help on command options;
   *     for failures during option handling and header loading the message lists the valid
   *     options and the original exception is attached as the cause
   */
  public static void filterFile(NominalToBinaryFilter filter, String[] options) throws Exception {

    boolean debug = false;
    M5Instances data = null;
    Reader input = null;
    PrintWriter output = null;
    // Only streams opened here are closed; System.in / System.out belong to the caller.
    boolean inputFromFile = false;
    boolean outputToStdout = false;

    try {
      try {
        boolean helpRequest = M5StaticUtils.getFlag('h', options);

        if (M5StaticUtils.getFlag('d', options)) {
          debug = true;
        }
        String infileName = M5StaticUtils.getOption('i', options);
        String outfileName = M5StaticUtils.getOption('o', options);
        String classIndex = M5StaticUtils.getOption('c', options);

        if (filter instanceof NominalToBinaryFilter) {
          ((NominalToBinaryFilter) filter).setOptions(options);
        }

        M5StaticUtils.checkForRemainingOptions(options);
        if (helpRequest) {
          throw new Exception("Help requested.\n");
        }
        if (infileName.length() != 0) {
          input = new BufferedReader(new FileReader(infileName));
          inputFromFile = true;
        } else {
          input = new BufferedReader(new InputStreamReader(System.in));
        }
        if (outfileName.length() != 0) {
          output = new PrintWriter(new FileOutputStream(outfileName));
        } else {
          output = new PrintWriter(System.out);
          outputToStdout = true;
        }

        data = new M5Instances(input, 1);
        if (classIndex.length() != 0) {
          if (classIndex.equals("first")) {
            data.setClassIndex(0);
          } else if (classIndex.equals("last")) {
            data.setClassIndex(data.numAttributes() - 1);
          } else {
            // The option is 1-based, class indices are 0-based.
            data.setClassIndex(Integer.parseInt(classIndex) - 1);
          }
        }
      } catch (Exception ex) {
        String filterOptions = "";
        // Output the error and also the valid options
        if (filter instanceof NominalToBinaryFilter) {
          filterOptions += "\nFilter options:\n\n";
          Enumeration enuma = ((NominalToBinaryFilter) filter).listOptions();
          while (enuma.hasMoreElements()) {
            Information option = (Information) enuma.nextElement();
            filterOptions += option.synopsis() + '\n' + option.description() + "\n";
          }
        }

        String genericOptions =
            "\nGeneral options:\n\n"
                + "-h\n"
                + "\tGet help on available options.\n"
                + "\t(use -b -h for help on batch mode.)\n"
                + "-i <file>\n"
                + "\tThe name of the file containing input instances.\n"
                + "\tIf not supplied then instances will be read from stdin.\n"
                + "-o <file>\n"
                + "\tThe name of the file output instances will be written to.\n"
                + "\tIf not supplied then instances will be written to stdout.\n"
                + "-c <class index>\n"
                + "\tThe number of the attribute to use as the class.\n"
                + "\t\"first\" and \"last\" are also valid entries.\n"
                + "\tIf not supplied then no class is assigned.\n";

        // Keep the original exception as the cause so its stack trace is not lost.
        throw new Exception('\n' + ex.getMessage() + filterOptions + genericOptions, ex);
      }

      if (debug) {
        System.err.println("Setting input format");
      }
      boolean printedHeader = false;
      if (filter.setInputFormat(data)) {
        if (debug) {
          System.err.println("Getting output format");
        }
        output.println(filter.getOutputFormat().toString());
        printedHeader = true;
      }

      // Pass all the instances to the filter
      while (data.readInstance(input)) {
        if (debug) {
          System.err.println("Input instance to filter");
        }
        if (filter.input(data.instance(0))) {
          if (debug) {
            System.err.println("Filter said collect immediately");
          }
          if (!printedHeader) {
            throw new Error("Filter didn't return true from setInputFormat() " + "earlier!");
          }
          if (debug) {
            System.err.println("Getting output instance");
          }
          output.println(filter.output().toString());
        }
        data.delete(0);
      }

      // Say that input has finished, and print any pending output instances
      if (debug) {
        System.err.println("Setting end of batch");
      }
      if (filter.batchFinished()) {
        if (debug) {
          System.err.println("Filter said collect output");
        }
        if (!printedHeader) {
          if (debug) {
            System.err.println("Getting output format");
          }
          output.println(filter.getOutputFormat().toString());
        }
        if (debug) {
          System.err.println("Getting output instance");
        }
        while (filter.numPendingOutput() > 0) {
          output.println(filter.output().toString());
          if (debug) {
            System.err.println("Getting output instance");
          }
        }
      }
      if (debug) {
        System.err.println("Done");
      }
    } finally {
      // Release the streams on every path, including failures while filtering.
      if (output != null) {
        if (outputToStdout) {
          output.flush();
        } else {
          output.close();
        }
      }
      if (inputFromFile && input != null) {
        try {
          input.close();
        } catch (java.io.IOException ignored) {
          // Best effort: a failure to close must not mask the primary outcome.
        }
      }
    }
  }