Beispiel #1
0
  public weka.core.Instances toWekaInstances() {
    // attributes
    FastVector wattrs = new FastVector();
    Iterator itr = attributes.iterator();
    while (itr.hasNext()) {
      Attribute attr = (Attribute) itr.next();
      wattrs.addElement(attr.toWekaAttribute());
    }
    // data instances
    weka.core.Instances winsts = new weka.core.Instances(name, wattrs, instances.size());
    itr = instances.iterator();

    while (itr.hasNext()) {
      Instance inst = (Instance) itr.next();
      Iterator itrval = inst.getValues().iterator();
      Iterator itrmis = inst.getMissing().iterator();
      double[] vals = new double[wattrs.size()];
      for (int i = 0; i < wattrs.size(); i++) {
        double val = (Double) itrval.next();
        if ((Boolean) itrmis.next()) {
          vals[i] = weka.core.Instance.missingValue();
        } else {
          vals[i] = val;
        }
      }
      weka.core.Instance winst = new weka.core.Instance(1, vals);
      winst.setDataset(winsts);
      winsts.add(winst);
    }
    winsts.setClassIndex(this.class_index);
    return winsts;
  }
Beispiel #2
0
  /**
   * Input an instance for filtering. Ordinarily the instance is processed and made available for
   * output immediately. Some filters require all instances be read before producing output.
   *
   * @param instance the input instance
   * @return true if the filtered instance may now be collected with output().
   * @exception IllegalStateException if no input format has been defined.
   * @exception Exception if there was a problem during the filtering.
   */
  public boolean input(Instance instance) throws Exception {

    if (getInputFormat() == null) {
      throw new IllegalStateException("No input instance format defined");
    }
    if (m_NewBatch) {
      resetQueue();
      m_NewBatch = false;
    }

    double[] vals = new double[instance.numAttributes() + 1];
    for (int i = 0; i < instance.numAttributes(); i++) {
      if (instance.isMissing(i)) {
        vals[i] = Instance.missingValue();
      } else {
        vals[i] = instance.value(i);
      }
    }

    evaluateExpression(vals);

    Instance inst = null;
    if (instance instanceof SparseInstance) {
      inst = new SparseInstance(instance.weight(), vals);
    } else {
      inst = new Instance(instance.weight(), vals);
    }
    copyStringValues(inst, false, instance.dataset(), getOutputFormat());
    inst.setDataset(getOutputFormat());
    push(inst);
    return true;
  }
  // calculate if an App fits to a pm
  // TODO: gst: use WEKA to calc fit factor!!
  private int calculateFit(App app2, VirtualMachine vm) {
    int output = 0;
    if (Action.isOnlyLearning() == false && CreateAppInsertIntoVm.evaluation != null) {
      // is free space available in the VM
      if (app2.getCpu() + vm.getCurrentCpuUsage() < vm.getCurrentCpuAllocation()
          && app2.getMemory() + vm.getCurrentMemoryUsage() < vm.getCurrentMemoryAllocation()
          && app2.getStorage() + vm.getCurrentStorageUsage() < vm.getCurrentCpuAllocation()) {

        Instance instance = createInstance(Instance.missingValue(), vm);
        instance.setDataset(CreateAppInsertIntoVm.getKnowledgeBase());

        try {
          output = (int) (evaluation.evaluateModelOnce(classifier, instance) * 100);
        } catch (Exception e) {
          e.printStackTrace();
        }
      }
    } else {
      if (app2.getCpu() + vm.getCurrentCpuUsage() < vm.getCurrentCpuAllocation()
          && app2.getMemory() + vm.getCurrentMemoryUsage() < vm.getCurrentMemoryAllocation()
          && app2.getStorage() + vm.getCurrentStorageUsage() < vm.getCurrentCpuAllocation()) {
        output = randomData.nextInt(1, 100);
      }
    }
    return output;
  }
Beispiel #4
0
  // use the TriTrainer Classifier to classify Instance;
  public double classifyInstance(Instance instance) throws Exception {
    double result;
    double[] dist;
    int index;
    dist = distributionForInstance(instance); // 分类概率

    if (instance.classAttribute().isNominal()) {
      index = Utils.maxIndex(dist); // 返回概率最大的
      if (dist[index] == 0) result = Instance.missingValue();
      else result = dist[index];
    } else if (instance.classAttribute().isNumeric()) {
      result = dist[0];
    } else {
      result = Instance.missingValue();
    }
    return result;
  }
Beispiel #5
0
  /**
   * Evaluate the expression using the supplied array of attribute values. The result is stored in
   * the last element of the array. Assumes that the infix expression has been converted to postfix
   * and stored in m_postFixExpVector
   *
   * @param vals the values to apply the expression to
   * @exception Exception if something goes wrong
   */
  private void evaluateExpression(double[] vals) throws Exception {
    Stack operands = new Stack();

    for (int i = 0; i < m_postFixExpVector.size(); i++) {
      Object nextob = m_postFixExpVector.elementAt(i);
      if (nextob instanceof NumericOperand) {
        operands.push(new Double(((NumericOperand) nextob).m_numericConst));
      } else if (nextob instanceof AttributeOperand) {
        double value = vals[((AttributeOperand) nextob).m_attributeIndex];
        if (value == Instance.missingValue()) {
          vals[vals.length - 1] = Instance.missingValue();
          break;
        }
        if (((AttributeOperand) nextob).m_negative) {
          value = -value;
        }
        operands.push(new Double(value));
      } else if (nextob instanceof Operator) {
        char op = ((Operator) nextob).m_operator;
        if (isUnaryFunction(op)) {
          double operand = ((Double) operands.pop()).doubleValue();
          double result = ((Operator) nextob).applyFunction(operand);
          operands.push(new Double(result));
        } else {
          double second = ((Double) operands.pop()).doubleValue();
          double first = ((Double) operands.pop()).doubleValue();
          double result = ((Operator) nextob).applyOperator(first, second);
          operands.push(new Double(result));
        }
      } else {
        throw new Exception("Unknown object in postfix vector!");
      }
    }

    if (operands.size() != 1) {
      throw new Exception("Problem applying function");
    }

    Double result = ((Double) operands.pop());
    if (result.isNaN() || result.isInfinite()) {
      vals[vals.length - 1] = Instance.missingValue();
    } else {
      vals[vals.length - 1] = result.doubleValue();
    }
  }
Beispiel #6
0
  private List<Instance> myExtractKeyphrases(String document, int numOfPhrases) throws Exception {

    // Check whether there is actually any data
    //
    if (document.length() == 0 || document.equals("")) {
      throw new Exception("Couldn't find any data!");
    }

    FastVector atts = new FastVector(3);
    atts.addElement(new Attribute("doc", (FastVector) null));
    atts.addElement(new Attribute("keyphrases", (FastVector) null));
    Instances data = new Instances("keyphrase_training_data", atts, 0);

    List<Instance> myInstances = new ArrayList<Instance>();

    double[] newInst = new double[2];
    newInst[0] = (double) data.attribute(0).addStringValue(document);
    newInst[1] = Instance.missingValue();

    data.add(new Instance(1.0, newInst));

    m_KEAFilter.input(data.instance(0));

    data = data.stringFreeStructure();

    ke.setNumPhrases(numOfPhrases);

    int numPhrases = numOfPhrases; // ke.getNumPhrases();

    Instance[] topRankedInstances = new Instance[numPhrases];
    Instance inst;

    // Iterating over all extracted keyphrases (inst)
    while ((inst = m_KEAFilter.output()) != null) {
      int index = (int) inst.value(m_KEAFilter.getRankIndex()) - 1;

      if (index < numPhrases) {
        topRankedInstances[index] = inst;
      }
    }

    double numExtracted = 0, numCorrect = 0;

    for (int i = 0; i < numPhrases; i++) {
      if (topRankedInstances[i] != null) {
        if (!topRankedInstances[i].isMissing(topRankedInstances[i].numAttributes() - 1)) {
          numExtracted += 1.0;
        }
        if ((int) topRankedInstances[i].value(topRankedInstances[i].numAttributes() - 1) == 1) {
          numCorrect += 1.0;
        }
        myInstances.add(topRankedInstances[i]);
      }
    }

    return myInstances;
  }
  /**
   * Classifies the given test instance. The instance has to belong to a dataset when it's being
   * classified. Note that a classifier MUST implement either this or distributionForInstance().
   *
   * @param instance the instance to be classified
   * @return the predicted most likely class for the instance or Instance.missingValue() if no
   *     prediction is made
   * @exception Exception if an error occurred during the prediction
   */
  public double classifyInstance(Instance instance) throws Exception {

    double[] dist = distributionForInstance(instance);
    if (dist == null) {
      throw new Exception("Null distribution predicted");
    }

    double max = 0;
    int maxIndex = 0;

    for (int i = 0; i < dist.length; i++) {
      if (dist[i] > max) {
        maxIndex = i;
        max = dist[i];
      }
    }
    if (max > 0) {
      return maxIndex;
    } else {
      return Instance.missingValue();
    }
  }
Beispiel #8
0
  /**
   * Creates a new instance the same as one instance (the "destination") but with some attribute
   * values copied from another instance (the "source")
   *
   * @param source the source instance
   * @param dest the destination instance
   * @return the new merged instance
   */
  protected Instance mergeInstances(Instance source, Instance dest) {

    Instances outputFormat = outputFormatPeek();
    double[] vals = new double[outputFormat.numAttributes()];
    for (int i = 0; i < vals.length; i++) {
      if ((i != outputFormat.classIndex()) && (m_SelectedCols.isInRange(i))) {
        if (source != null) {
          vals[i] = source.value(i);
        } else {
          vals[i] = Instance.missingValue();
        }
      } else {
        vals[i] = dest.value(i);
      }
    }
    Instance inst = null;
    if (dest instanceof SparseInstance) {
      inst = new SparseInstance(dest.weight(), vals);
    } else {
      inst = new Instance(dest.weight(), vals);
    }
    inst.setDataset(dest.dataset());
    return inst;
  }
Beispiel #9
0
  /**
   * Makes a database query to convert a table into a set of instances
   *
   * @param query the query to convert to instances
   * @return the instances contained in the result of the query, NULL if the SQL query doesn't
   *     return a ResultSet, e.g., DELETE/INSERT/UPDATE
   * @throws Exception if an error occurs
   */
  public Instances retrieveInstances(String query) throws Exception {

    if (m_Debug) System.err.println("Executing query: " + query);
    connectToDatabase();
    if (execute(query) == false) {
      if (m_PreparedStatement.getUpdateCount() == -1) {
        throw new Exception("Query didn't produce results");
      } else {
        if (m_Debug) System.err.println(m_PreparedStatement.getUpdateCount() + " rows affected.");
        close();
        return null;
      }
    }
    ResultSet rs = getResultSet();
    if (m_Debug) System.err.println("Getting metadata...");
    ResultSetMetaData md = rs.getMetaData();
    if (m_Debug) System.err.println("Completed getting metadata...");

    // Determine structure of the instances
    int numAttributes = md.getColumnCount();
    int[] attributeTypes = new int[numAttributes];
    Hashtable[] nominalIndexes = new Hashtable[numAttributes];
    FastVector[] nominalStrings = new FastVector[numAttributes];
    for (int i = 1; i <= numAttributes; i++) {
      /* switch (md.getColumnType(i)) {
      case Types.CHAR:
      case Types.VARCHAR:
      case Types.LONGVARCHAR:
      case Types.BINARY:
      case Types.VARBINARY:
      case Types.LONGVARBINARY:*/

      switch (translateDBColumnType(md.getColumnTypeName(i))) {
        case STRING:
          // System.err.println("String --> nominal");
          attributeTypes[i - 1] = Attribute.NOMINAL;
          nominalIndexes[i - 1] = new Hashtable();
          nominalStrings[i - 1] = new FastVector();
          break;
        case TEXT:
          // System.err.println("Text --> string");
          attributeTypes[i - 1] = Attribute.STRING;
          nominalIndexes[i - 1] = new Hashtable();
          nominalStrings[i - 1] = new FastVector();
          break;
        case BOOL:
          // System.err.println("boolean --> nominal");
          attributeTypes[i - 1] = Attribute.NOMINAL;
          nominalIndexes[i - 1] = new Hashtable();
          nominalIndexes[i - 1].put("false", new Double(0));
          nominalIndexes[i - 1].put("true", new Double(1));
          nominalStrings[i - 1] = new FastVector();
          nominalStrings[i - 1].addElement("false");
          nominalStrings[i - 1].addElement("true");
          break;
        case DOUBLE:
          // System.err.println("BigDecimal --> numeric");
          attributeTypes[i - 1] = Attribute.NUMERIC;
          break;
        case BYTE:
          // System.err.println("byte --> numeric");
          attributeTypes[i - 1] = Attribute.NUMERIC;
          break;
        case SHORT:
          // System.err.println("short --> numeric");
          attributeTypes[i - 1] = Attribute.NUMERIC;
          break;
        case INTEGER:
          // System.err.println("int --> numeric");
          attributeTypes[i - 1] = Attribute.NUMERIC;
          break;
        case LONG:
          // System.err.println("long --> numeric");
          attributeTypes[i - 1] = Attribute.NUMERIC;
          break;
        case FLOAT:
          // System.err.println("float --> numeric");
          attributeTypes[i - 1] = Attribute.NUMERIC;
          break;
        case DATE:
          attributeTypes[i - 1] = Attribute.DATE;
          break;
        case TIME:
          attributeTypes[i - 1] = Attribute.DATE;
          break;
        default:
          // System.err.println("Unknown column type");
          attributeTypes[i - 1] = Attribute.STRING;
      }
    }

    // For sqlite
    // cache column names because the last while(rs.next()) { iteration for
    // the tuples below will close the md object:
    Vector<String> columnNames = new Vector<String>();
    for (int i = 0; i < numAttributes; i++) {
      columnNames.add(md.getColumnName(i + 1));
    }

    // Step through the tuples
    if (m_Debug) System.err.println("Creating instances...");
    FastVector instances = new FastVector();
    int rowCount = 0;
    while (rs.next()) {
      if (rowCount % 100 == 0) {
        if (m_Debug) {
          System.err.print("read " + rowCount + " instances \r");
          System.err.flush();
        }
      }
      double[] vals = new double[numAttributes];
      for (int i = 1; i <= numAttributes; i++) {
        /*switch (md.getColumnType(i)) {
        case Types.CHAR:
        case Types.VARCHAR:
        case Types.LONGVARCHAR:
        case Types.BINARY:
        case Types.VARBINARY:
        case Types.LONGVARBINARY:*/
        switch (translateDBColumnType(md.getColumnTypeName(i))) {
          case STRING:
            String str = rs.getString(i);

            if (rs.wasNull()) {
              vals[i - 1] = Instance.missingValue();
            } else {
              Double index = (Double) nominalIndexes[i - 1].get(str);
              if (index == null) {
                index = new Double(nominalStrings[i - 1].size());
                nominalIndexes[i - 1].put(str, index);
                nominalStrings[i - 1].addElement(str);
              }
              vals[i - 1] = index.doubleValue();
            }
            break;
          case TEXT:
            String txt = rs.getString(i);

            if (rs.wasNull()) {
              vals[i - 1] = Instance.missingValue();
            } else {
              Double index = (Double) nominalIndexes[i - 1].get(txt);
              if (index == null) {
                index = new Double(nominalStrings[i - 1].size());
                nominalIndexes[i - 1].put(txt, index);
                nominalStrings[i - 1].addElement(txt);
              }
              vals[i - 1] = index.doubleValue();
            }
            break;
          case BOOL:
            boolean boo = rs.getBoolean(i);
            if (rs.wasNull()) {
              vals[i - 1] = Instance.missingValue();
            } else {
              vals[i - 1] = (boo ? 1.0 : 0.0);
            }
            break;
          case DOUBLE:
            //	  BigDecimal bd = rs.getBigDecimal(i, 4);
            double dd = rs.getDouble(i);
            // Use the column precision instead of 4?
            if (rs.wasNull()) {
              vals[i - 1] = Instance.missingValue();
            } else {
              //	    newInst.setValue(i - 1, bd.doubleValue());
              vals[i - 1] = dd;
            }
            break;
          case BYTE:
            byte by = rs.getByte(i);
            if (rs.wasNull()) {
              vals[i - 1] = Instance.missingValue();
            } else {
              vals[i - 1] = (double) by;
            }
            break;
          case SHORT:
            short sh = rs.getShort(i);
            if (rs.wasNull()) {
              vals[i - 1] = Instance.missingValue();
            } else {
              vals[i - 1] = (double) sh;
            }
            break;
          case INTEGER:
            int in = rs.getInt(i);
            if (rs.wasNull()) {
              vals[i - 1] = Instance.missingValue();
            } else {
              vals[i - 1] = (double) in;
            }
            break;
          case LONG:
            long lo = rs.getLong(i);
            if (rs.wasNull()) {
              vals[i - 1] = Instance.missingValue();
            } else {
              vals[i - 1] = (double) lo;
            }
            break;
          case FLOAT:
            float fl = rs.getFloat(i);
            if (rs.wasNull()) {
              vals[i - 1] = Instance.missingValue();
            } else {
              vals[i - 1] = (double) fl;
            }
            break;
          case DATE:
            Date date = rs.getDate(i);
            if (rs.wasNull()) {
              vals[i - 1] = Instance.missingValue();
            } else {
              // TODO: Do a value check here.
              vals[i - 1] = (double) date.getTime();
            }
            break;
          case TIME:
            Time time = rs.getTime(i);
            if (rs.wasNull()) {
              vals[i - 1] = Instance.missingValue();
            } else {
              // TODO: Do a value check here.
              vals[i - 1] = (double) time.getTime();
            }
            break;
          default:
            vals[i - 1] = Instance.missingValue();
        }
      }
      Instance newInst;
      if (m_CreateSparseData) {
        newInst = new SparseInstance(1.0, vals);
      } else {
        newInst = new Instance(1.0, vals);
      }
      instances.addElement(newInst);
      rowCount++;
    }
    // disconnectFromDatabase();  (perhaps other queries might be made)

    // Create the header and add the instances to the dataset
    if (m_Debug) System.err.println("Creating header...");
    FastVector attribInfo = new FastVector();
    for (int i = 0; i < numAttributes; i++) {
      /* Fix for databases that uppercase column names */
      // String attribName = attributeCaseFix(md.getColumnName(i + 1));
      String attribName = attributeCaseFix(columnNames.get(i));
      switch (attributeTypes[i]) {
        case Attribute.NOMINAL:
          attribInfo.addElement(new Attribute(attribName, nominalStrings[i]));
          break;
        case Attribute.NUMERIC:
          attribInfo.addElement(new Attribute(attribName));
          break;
        case Attribute.STRING:
          Attribute att = new Attribute(attribName, (FastVector) null);
          attribInfo.addElement(att);
          for (int n = 0; n < nominalStrings[i].size(); n++) {
            att.addStringValue((String) nominalStrings[i].elementAt(n));
          }
          break;
        case Attribute.DATE:
          attribInfo.addElement(new Attribute(attribName, (String) null));
          break;
        default:
          throw new Exception("Unknown attribute type");
      }
    }
    Instances result = new Instances("QueryResult", attribInfo, instances.size());
    for (int i = 0; i < instances.size(); i++) {
      result.add((Instance) instances.elementAt(i));
    }
    close(rs);

    return result;
  }
  /** Builds the model from the files */
  public void extractKeyphrases(Hashtable stems) throws Exception {

    Vector stats = new Vector();

    // Check whether there is actually any data
    if (stems.size() == 0) {
      throw new Exception("Couldn't find any data!");
    }

    FastVector atts = new FastVector(2);
    atts.addElement(new Attribute("doc", (FastVector) null));
    atts.addElement(new Attribute("keyphrases", (FastVector) null));
    Instances data = new Instances("keyphrase_training_data", atts, 0);

    // Extract keyphrases
    Enumeration elem = stems.keys();
    while (elem.hasMoreElements()) {
      String str = (String) elem.nextElement();
      double[] newInst = new double[2];
      try {
        File txt = new File(m_dirName + "/" + str + ".txt");
        Reader is;
        if (!m_encoding.equals("default")) {
          is = new BomStrippingInputStreamReader(new FileInputStream(txt), m_encoding);
        } else {
          is = new BomStrippingInputStreamReader(new FileInputStream(txt));
        }
        StringBuffer txtStr = new StringBuffer();
        int c;
        while ((c = is.read()) != -1) {
          txtStr.append((char) c);
        }
        newInst[0] = (double) data.attribute(0).addStringValue(txtStr.toString());
      } catch (Exception e) {
        if (m_debug) {
          System.err.println("Can't read document " + str + ".txt");
        }
        newInst[0] = Instance.missingValue();
      }
      try {
        File key = new File(m_dirName + "/" + str + ".key");
        Reader is;
        if (!m_encoding.equals("default")) {
          is = new BomStrippingInputStreamReader(new FileInputStream(key), m_encoding);
        } else {
          is = new BomStrippingInputStreamReader(new FileInputStream(key));
        }
        StringBuffer keyStr = new StringBuffer();
        int c;
        while ((c = is.read()) != -1) {
          keyStr.append((char) c);
        }
        newInst[1] = (double) data.attribute(1).addStringValue(keyStr.toString());
      } catch (Exception e) {
        if (m_debug) {
          System.err.println("No keyphrases for stem " + str + ".");
        }
        newInst[1] = Instance.missingValue();
      }
      data.add(new Instance(1.0, newInst));
      m_KEAFilter.input(data.instance(0));
      data = data.stringFreeStructure();
      if (m_debug) {
        System.err.println("-- Document: " + str);
      }
      Instance[] topRankedInstances = new Instance[m_numPhrases];
      Instance inst;
      while ((inst = m_KEAFilter.output()) != null) {
        int index = (int) inst.value(m_KEAFilter.getRankIndex()) - 1;
        if (index < m_numPhrases) {
          topRankedInstances[index] = inst;
        }
      }
      if (m_debug) {
        System.err.println("-- Keyphrases and feature values:");
      }
      FileOutputStream out = null;
      PrintWriter printer = null;
      File key = new File(m_dirName + "/" + str + ".key");
      if (!key.exists()) {
        out = new FileOutputStream(m_dirName + "/" + str + ".key");
        if (!m_encoding.equals("default")) {
          printer = new PrintWriter(new OutputStreamWriter(out, m_encoding));
        } else {
          printer = new PrintWriter(out);
        }
      }
      double numExtracted = 0, numCorrect = 0;
      for (int i = 0; i < m_numPhrases; i++) {
        if (topRankedInstances[i] != null) {
          if (!topRankedInstances[i].isMissing(topRankedInstances[i].numAttributes() - 1)) {
            numExtracted += 1.0;
          }
          if ((int) topRankedInstances[i].value(topRankedInstances[i].numAttributes() - 1)
              == topRankedInstances[i]
                  .attribute(topRankedInstances[i].numAttributes() - 1)
                  .indexOfValue("True")) {
            numCorrect += 1.0;
          }
          if (printer != null) {
            printer.print(topRankedInstances[i].stringValue(m_KEAFilter.getUnstemmedPhraseIndex()));
            if (m_AdditionalInfo) {
              printer.print("\t");
              printer.print(topRankedInstances[i].stringValue(m_KEAFilter.getStemmedPhraseIndex()));
              printer.print("\t");
              printer.print(
                  Utils.doubleToString(
                      topRankedInstances[i].value(m_KEAFilter.getProbabilityIndex()), 4));
            }
            printer.println();
          }
          if (m_debug) {
            System.err.println(topRankedInstances[i]);
          }
        }
      }
      if (numExtracted > 0) {
        if (m_debug) {
          System.err.println("-- " + numCorrect + " correct");
        }
        stats.addElement(new Double(numCorrect));
      }
      if (printer != null) {
        printer.flush();
        printer.close();
        out.close();
      }
    }
    double[] st = new double[stats.size()];
    for (int i = 0; i < stats.size(); i++) {
      st[i] = ((Double) stats.elementAt(i)).doubleValue();
    }
    double avg = Utils.mean(st);
    double stdDev = Math.sqrt(Utils.variance(st));
    System.err.println(
        "Avg. number of correct keyphrases: "
            + Utils.doubleToString(avg, 2)
            + " +/- "
            + Utils.doubleToString(stdDev, 2));
    System.err.println("Based on " + stats.size() + " documents");
    m_KEAFilter.batchFinished();
  }
  /** Builds the model from the training data */
  public void buildModel(HashSet<String> fileNames) throws Exception {

    // Check whether there is actually any data
    if (fileNames.size() == 0) {
      throw new Exception("Couldn't find any data in " + inputDirectoryName);
    }

    System.err.println("-- Building the model... ");

    FastVector atts = new FastVector(3);
    atts.addElement(new Attribute("filename", (FastVector) null));
    atts.addElement(new Attribute("document", (FastVector) null));
    atts.addElement(new Attribute("keyphrases", (FastVector) null));
    Instances data = new Instances("keyphrase_training_data", atts, 0);

    // Build model
    mauiFilter = new MauiFilter();

    mauiFilter.setDebug(getDebug());
    mauiFilter.setMaxPhraseLength(getMaxPhraseLength());
    mauiFilter.setMinPhraseLength(getMinPhraseLength());
    mauiFilter.setMinNumOccur(getMinNumOccur());
    mauiFilter.setStemmer(getStemmer());
    mauiFilter.setDocumentLanguage(getDocumentLanguage());
    mauiFilter.setVocabularyName(getVocabularyName());
    mauiFilter.setVocabularyFormat(getVocabularyFormat());
    mauiFilter.setStopwords(getStopwords());

    if (wikipedia != null) {
      mauiFilter.setWikipedia(wikipedia);
    } else if (wikipediaServer.equals("localhost") && wikipediaDatabase.equals("database")) {
      mauiFilter.setWikipedia(wikipedia);
    } else {
      mauiFilter.setWikipedia(
          wikipediaServer, wikipediaDatabase, cacheWikipediaData, wikipediaDataDirectory);
    }

    if (classifier != null) {
      mauiFilter.setClassifier(classifier);
    }

    mauiFilter.setInputFormat(data);

    // set features configurations
    mauiFilter.setBasicFeatures(useBasicFeatures);
    mauiFilter.setKeyphrasenessFeature(useKeyphrasenessFeature);
    mauiFilter.setFrequencyFeatures(useFrequencyFeatures);
    mauiFilter.setPositionsFeatures(usePositionsFeatures);
    mauiFilter.setLengthFeature(useLengthFeature);
    mauiFilter.setThesaurusFeatures(useNodeDegreeFeature);
    mauiFilter.setBasicWikipediaFeatures(useBasicWikipediaFeatures);
    mauiFilter.setAllWikipediaFeatures(useAllWikipediaFeatures);
    mauiFilter.setThesaurusFeatures(useNodeDegreeFeature);

    mauiFilter.setClassifier(classifier);

    mauiFilter.setContextSize(contextSize);
    mauiFilter.setMinKeyphraseness(minKeyphraseness);
    mauiFilter.setMinSenseProbability(minSenseProbability);

    if (!vocabularyName.equals("none") && !vocabularyName.equals("wikipedia")) {
      mauiFilter.loadThesaurus(getStemmer(), getStopwords());
    }

    System.err.println("-- Reading the input documents... ");

    for (String fileName : fileNames) {

      double[] newInst = new double[3];

      newInst[0] = (double) data.attribute(0).addStringValue(fileName);
      ;

      File documentTextFile = new File(inputDirectoryName + "/" + fileName + ".txt");
      File documentTopicsFile = new File(inputDirectoryName + "/" + fileName + ".key");

      try {

        InputStreamReader is;
        if (!documentEncoding.equals("default")) {
          is = new InputStreamReader(new FileInputStream(documentTextFile), documentEncoding);
        } else {
          is = new InputStreamReader(new FileInputStream(documentTextFile));
        }

        // Reading the file content
        StringBuffer txtStr = new StringBuffer();
        int c;
        while ((c = is.read()) != -1) {
          txtStr.append((char) c);
        }
        is.close();

        // Adding the text of the document to the instance
        newInst[1] = (double) data.attribute(1).addStringValue(txtStr.toString());

      } catch (Exception e) {

        System.err.println("Problem with reading " + documentTextFile);
        e.printStackTrace();
        newInst[1] = Instance.missingValue();
      }

      try {

        InputStreamReader is;
        if (!documentEncoding.equals("default")) {
          is = new InputStreamReader(new FileInputStream(documentTopicsFile), documentEncoding);
        } else {
          is = new InputStreamReader(new FileInputStream(documentTopicsFile));
        }

        // Reading the content of the keyphrase file
        StringBuffer keyStr = new StringBuffer();
        int c;
        while ((c = is.read()) != -1) {
          keyStr.append((char) c);
        }

        // Adding the topics to the file
        newInst[2] = (double) data.attribute(2).addStringValue(keyStr.toString());

      } catch (Exception e) {

        System.err.println("Problem with reading " + documentTopicsFile);
        e.printStackTrace();
        newInst[2] = Instance.missingValue();
      }

      data.add(new Instance(1.0, newInst));

      mauiFilter.input(data.instance(0));
      data = data.stringFreeStructure();
    }
    mauiFilter.batchFinished();

    while ((mauiFilter.output()) != null) {}
    ;
  }
Beispiel #12
0
  /**
   * ************************************************** Convert a table to a set of instances, with
   * <b>columns</b> representing individual </b>instances</b> and <b>rows</b> representing
   * <b>attributes</b> (e.g. as is common with microarray data)
   */
  public Instances tableColsToInstances(Table t, String relationName) {

    System.err.print("Converting table cols to instances...");

    // Set up attributes, which for colInstances will be the rowNames...
    FastVector atts = new FastVector();
    ArrayList<Boolean> isNominal = new ArrayList<Boolean>();
    ArrayList<FastVector> allAttVals = new ArrayList<FastVector>(); // Save values for later...

    System.err.print("creating attributes...");

    for (int r = 0; r < t.numRows; r++) {
      if (rowIsNumeric(t, r)) {
        isNominal.add(false);
        atts.addElement(new Attribute(t.rowNames[r]));
        allAttVals.add(null); // No enumeration of attribute values.
      } else {
        // It's nominal... determine the range of values and create a nominal attribute...
        isNominal.add(true);
        FastVector attVals = getRowValues(t, r);
        atts.addElement(new Attribute(t.rowNames[r], attVals));
        // Save it for later
        allAttVals.add(attVals);
      }
    }

    System.err.print("creating instances...");

    // Create Instances object..
    Instances data = new Instances(relationName, atts, 0);
    data.setRelationName(relationName);

    /** ***** CREATE INSTANCES ************* */
    // Fill the instances with data...
    // For each instance...
    for (int c = 0; c < t.numCols; c++) {
      double[] vals =
          new double[data.numAttributes()]; // Even nominal values are stored as double pointers.

      // For each attribute fill in the numeric or attributeValue index...
      for (int r = 0; r < t.numRows; r++) {
        String val = (String) t.matrix.getQuick(r, c);
        if (val == "?") vals[r] = Instance.missingValue();
        else if (isNominal.get(r)) {
          vals[r] = allAttVals.get(r).indexOf(val);
        } else {
          vals[r] = Double.parseDouble((String) val);
        }
      }
      // Add the a newly minted instance with those attribute values...
      data.add(new Instance(1.0, vals));
    }

    System.err.print("add feature names...");

    /** ***** ADD FEATURE NAMES ************* */
    // takes basically zero time... all time is in previous 2 chunks.
    if (addInstanceNamesAsFeatures) {
      Instances newData = new Instances(data);
      newData.insertAttributeAt(new Attribute("ID", (FastVector) null), 0);
      int attrIdx = newData.attribute("ID").index(); // Paranoid... should be 0

      // We save the instanceNames in a list because it's handy later on...
      instanceNames = new ArrayList<String>();

      for (int c = 0; c < t.colNames.length; c++) {
        instanceNames.add(t.colNames[c]);
        newData.instance(c).setValue(attrIdx, t.colNames[c]);
      }
      data = newData;
    }

    System.err.println("done.");

    return (data);
  }
Beispiel #13
0
  /**
   * ************************************************** Convert a table to a set of instances, with
   * <b>rows</b> representing individual </b>instances</b> and <b>columns</b> representing
   * <b>attributes</b>
   */
  public Instances tableRowsToNominalInstances(Table t, String relationName) {

    System.err.print("Converting table rows to instances...");

    // Set up attributes, which for rowInstances will be the colNames...
    FastVector atts = new FastVector();
    ArrayList<Boolean> isNominal = new ArrayList<Boolean>();
    ArrayList<FastVector> allAttVals = new ArrayList<FastVector>(); // Save values for later...			

    System.err.print("creating attributes...");

    for (int c = 0; c < t.numCols; c++) {
      // It's nominal... determine the range of values
      isNominal.add(true);
      FastVector attVals = getColValues(t, c);
      atts.addElement(new Attribute(t.colNames[c], attVals));
      // Save it for later
      allAttVals.add(attVals);
    }

    System.err.print("creating instances...");

    // Create Instances object..
    Instances data = new Instances(relationName, atts, 0);
    data.setRelationName(relationName);

    // Fill the instances with data...
    // For each instance...
    for (int r = 0; r < t.numRows; r++) {
      double[] vals = new double[data.numAttributes()];

      // for each attribute
      for (int c = 0; c < t.numCols; c++) {
        String val = (String) t.matrix.getQuick(r, c);
        if (val == "?") vals[c] = Instance.missingValue();
        else if (isNominal.get(c)) {
          vals[c] = allAttVals.get(c).indexOf(val);
        } else {
          vals[c] = Double.parseDouble((String) val);
        }
      }
      // Add the a newly minted instance with those attribute values...
      data.add(new Instance(1.0, vals));
    }

    System.err.print("add feature names...");

    if (addInstanceNamesAsFeatures) {
      Instances newData = new Instances(data);
      newData.insertAttributeAt(new Attribute("ID", (FastVector) null), 0);
      int attrIdx = newData.attribute("ID").index(); // Paranoid... should be 0

      // We save the instanceNames in a list because it's handy later on...
      instanceNames = new ArrayList<String>();

      for (int r = 0; r < t.rowNames.length; r++) {
        instanceNames.add(t.rowNames[r]);
        newData.instance(r).setValue(attrIdx, t.rowNames[r]);
      }
      data = newData;
    }

    System.err.println("done.");

    return (data);
  }
Beispiel #14
0
  /**
   * If we know in advance that the table is numeric, can optimize a lot... For example, on 9803 x
   * 294 table, TableFileLoader.readNumeric takes 6s compared to 12s for WekaMine readFromTable.
   */
  public static Instances readNumeric(String fileName, String relationName, String delimiter)
      throws Exception {

    int numAttributes = FileUtils.fastCountLines(fileName) - 1; // -1 exclude heading.
    String[] attrNames = new String[numAttributes];

    // Read the col headings and figure out the number of columns in the table..
    BufferedReader reader = new BufferedReader(new FileReader(fileName), 4194304);
    String line = reader.readLine();
    String[] instanceNames = parseColNames(line, delimiter);
    int numInstances = instanceNames.length;

    System.err.print("reading " + numAttributes + " x " + numInstances + " table..");

    // Create an array to hold the data as we read it in...
    double dataArray[][] = new double[numAttributes][numInstances];

    // Populate the matrix with values...
    String valToken = "";
    try {
      int rowIdx = 0;
      while ((line = reader.readLine()) != null) {

        String[] tokens = line.split(delimiter, -1);
        attrNames[rowIdx] = tokens[0].trim();
        for (int colIdx = 0; colIdx < (tokens.length - 1); colIdx++) {
          valToken = tokens[colIdx + 1];
          double value;

          if (valToken.equals("null")) {
            value = Instance.missingValue();
          } else if (valToken.equals("?")) {
            value = Instance.missingValue();
          } else if (valToken.equals("NA")) {
            value = Instance.missingValue();
          } else if (valToken.equals("")) {
            value = Instance.missingValue();
            // }else value = DoubleParser.lightningParse(valToken); // faster double parser with
            // MANY assumptions
          } else value = Double.parseDouble(valToken);
          dataArray[rowIdx][colIdx] = value;
        }
        rowIdx++;
      }
    } catch (NumberFormatException e) {
      System.err.println(e.toString());
      System.err.println("Parsing line: " + line);
      System.err.println("Parsing token: " + valToken);
    }

    // Set up attributes, which for colInstances will be the rowNames...
    FastVector atts = new FastVector();
    for (int a = 0; a < numAttributes; a++) {
      atts.addElement(new Attribute(attrNames[a]));
    }

    // Create Instances object..
    Instances data = new Instances(relationName, atts, 0);
    data.setRelationName(relationName);

    System.err.print("creating instances..");

    // System.err.println("DEBUG: numAttributes "+numAttributes);

    /** ***** CREATE INSTANCES ************* */
    // Fill the instances with data...
    // For each instance...
    for (int c = 0; c < numInstances; c++) {
      double[] vals =
          new double[data.numAttributes()]; // Even nominal values are stored as double pointers.

      for (int r = 0; r < numAttributes; r++) {
        double val = dataArray[r][c];
        vals[r] = val;
      }
      // Add the a newly minted instance with those attribute values...
      data.add(new Instance(1.0, vals));
    }

    // System.err.println("DEBUG: data.numInstances: "+data.numInstances());
    // System.err.println("DEBUG: data.numAttributes: "+data.numAttributes());
    // System.err.println("DEBUG: data.relationNAme"+data.relationName());
    System.err.print("add feature names..");

    /** ***** ADD FEATURE NAMES ************* */
    // takes basically zero time... all time is in previous 2 chunks.
    Instances newData = new Instances(data);
    newData.insertAttributeAt(new Attribute("ID", (FastVector) null), 0);
    int attrIdx = newData.attribute("ID").index(); // Paranoid... should be 0

    for (int c = 0; c < numInstances; c++) {
      newData.instance(c).setValue(attrIdx, instanceNames[c]);
    }
    data = newData;

    // System.err.println("DEBUG: data.numInstances: "+data.numInstances());
    // System.err.println("DEBUG: data.numAttributes: "+data.numAttributes());

    return (data);
  }