Beispiel #1
0
  @Override
  public Instances getDataSet() throws IOException {

    if (m_sourceReader == null) {
      throw new IOException("No source has been specified");
    }

    if (getRetrieval() == INCREMENTAL) {
      throw new IOException("Cannot mix getting instances in both incremental and batch modes");
    }
    setRetrieval(BATCH);

    if (m_structure == null) {
      getStructure();
    }

    while (readData(true)) {;
    }

    m_dataDumper.flush();
    m_dataDumper.close();

    // make final structure
    makeStructure();

    Reader sr = new BufferedReader(new FileReader(m_tempFile));
    ArffReader initialArff = new ArffReader(sr, m_structure, 0, m_fieldSeparatorAndEnclosures);

    Instances initialInsts = initialArff.getData();
    sr.close();
    initialArff = null;

    return initialInsts;
  }
Beispiel #2
0
  /**
   * Return the full data set. If the structure hasn't yet been determined by a call to getStructure
   * then method should do so before processing the rest of the data set.
   *
   * @return the structure of the data set as an empty set of Instances
   * @exception IOException if there is no source or parsing fails
   */
  @Override
  public Instances getDataSet() throws IOException {
    if ((m_sourceFile == null) && (m_sourceReader == null)) {
      throw new IOException("No source has been specified");
    }

    if (m_structure == null) {
      getStructure();
    }

    if (m_st == null) {
      m_st = new StreamTokenizer(m_sourceReader);
      initTokenizer(m_st);
    }

    m_st.ordinaryChar(m_FieldSeparator.charAt(0));

    m_cumulativeStructure = new ArrayList<Hashtable<Object, Integer>>(m_structure.numAttributes());
    for (int i = 0; i < m_structure.numAttributes(); i++) {
      m_cumulativeStructure.add(new Hashtable<Object, Integer>());
    }

    m_cumulativeInstances = new ArrayList<ArrayList<Object>>();
    ArrayList<Object> current;
    while ((current = getInstance(m_st)) != null) {
      m_cumulativeInstances.add(current);
    }

    ArrayList<Attribute> atts = new ArrayList<Attribute>(m_structure.numAttributes());
    for (int i = 0; i < m_structure.numAttributes(); i++) {
      String attname = m_structure.attribute(i).name();
      Hashtable<Object, Integer> tempHash = m_cumulativeStructure.get(i);
      if (tempHash.size() == 0) {
        atts.add(new Attribute(attname));
      } else {
        if (m_StringAttributes.isInRange(i)) {
          atts.add(new Attribute(attname, (ArrayList<String>) null));
        } else {
          ArrayList<String> values = new ArrayList<String>(tempHash.size());
          // add dummy objects in order to make the ArrayList's size == capacity
          for (int z = 0; z < tempHash.size(); z++) {
            values.add("dummy");
          }
          Enumeration e = tempHash.keys();
          while (e.hasMoreElements()) {
            Object ob = e.nextElement();
            //	  if (ob instanceof Double) {
            int index = ((Integer) tempHash.get(ob)).intValue();
            String s = ob.toString();
            if (s.startsWith("'") || s.startsWith("\"")) s = s.substring(1, s.length() - 1);
            values.set(index, new String(s));
            //	  }
          }
          atts.add(new Attribute(attname, values));
        }
      }
    }

    // make the instances
    String relationName;
    if (m_sourceFile != null)
      relationName = (m_sourceFile.getName()).replaceAll("\\.[cC][sS][vV]$", "");
    else relationName = "stream";
    Instances dataSet = new Instances(relationName, atts, m_cumulativeInstances.size());

    for (int i = 0; i < m_cumulativeInstances.size(); i++) {
      current = m_cumulativeInstances.get(i);
      double[] vals = new double[dataSet.numAttributes()];
      for (int j = 0; j < current.size(); j++) {
        Object cval = current.get(j);
        if (cval instanceof String) {
          if (((String) cval).compareTo(m_MissingValue) == 0) {
            vals[j] = Utils.missingValue();
          } else {
            if (dataSet.attribute(j).isString()) {
              vals[j] = dataSet.attribute(j).addStringValue((String) cval);
            } else if (dataSet.attribute(j).isNominal()) {
              // find correct index
              Hashtable<Object, Integer> lookup = m_cumulativeStructure.get(j);
              int index = ((Integer) lookup.get(cval)).intValue();
              vals[j] = index;
            } else {
              throw new IllegalStateException(
                  "Wrong attribute type at position " + (i + 1) + "!!!");
            }
          }
        } else if (dataSet.attribute(j).isNominal()) {
          // find correct index
          Hashtable<Object, Integer> lookup = m_cumulativeStructure.get(j);
          int index = ((Integer) lookup.get(cval)).intValue();
          vals[j] = index;
        } else if (dataSet.attribute(j).isString()) {
          vals[j] = dataSet.attribute(j).addStringValue("" + cval);
        } else {
          vals[j] = ((Double) cval).doubleValue();
        }
      }
      dataSet.add(new DenseInstance(1.0, vals));
    }
    m_structure = new Instances(dataSet, 0);
    setRetrieval(BATCH);
    m_cumulativeStructure = null; // conserve memory

    // close the stream
    m_sourceReader.close();

    return dataSet;
  }