예제 #1
0
  /**
   * Trains the base associator on a filtered copy of the supplied data.
   *
   * <p>A copy of the input is created first. Its class index is taken from {@code
   * getClassIndex()}: a value of 0 selects the last attribute, any other value {@code v} selects
   * index {@code v - 1}. Unless {@code getClassIndex()} is -1, instances with a missing class are
   * deleted from the copy. The copy is then run through the filter, the associator's capabilities
   * are tested against the filtered data, and the associator is built on it.
   *
   * @param data the training data
   * @throws Exception if no base associator has been set or the associator could not be built
   */
  public void buildAssociations(Instances data) throws Exception {
    if (m_Associator == null) {
      throw new Exception("No base associator has been set!");
    }

    // Work on a copy and translate the configured class index into an attribute index.
    Instances working = new Instances(data);
    int classAttribute =
        (getClassIndex() == 0) ? working.numAttributes() - 1 : getClassIndex() - 1;
    working.setClassIndex(classAttribute);

    // Instances without a class value are only dropped when a class is configured.
    boolean classConfigured = getClassIndex() != -1;
    if (classConfigured) {
      working.deleteWithMissingClass();
    }

    // Setting the input format is also where the filter's capabilities get checked.
    m_Filter.setInputFormat(working);
    Instances filtered = Filter.useFilter(working, m_Filter);

    // Fail early if the associator cannot handle the filtered data.
    getAssociator().getCapabilities().testWithFail(filtered);

    m_FilteredInstances = filtered.stringFreeStructure();
    m_Associator.buildAssociations(filtered);
  }
예제 #2
0
  /**
   * Runs a single document through {@code m_KEAFilter} and collects the top-ranked output
   * instances.
   *
   * <p>The document is wrapped in a one-instance dataset with two string attributes ("doc" and
   * "keyphrases", the latter left missing) and fed to the filter. Every instance the filter emits
   * is placed into a slot determined by its rank value minus one; instances whose slot is at or
   * beyond {@code numOfPhrases} are discarded.
   *
   * @param document the text to extract keyphrases from; must not be empty
   * @param numOfPhrases the maximum number of ranked instances to return; also forwarded to
   *     {@code ke.setNumPhrases}
   * @return the non-empty rank slots in ascending rank order (at most {@code numOfPhrases}
   *     entries)
   * @throws Exception if {@code document} is empty or the filter fails
   */
  private List<Instance> myExtractKeyphrases(String document, int numOfPhrases) throws Exception {

    // Check whether there is actually any data
    if (document.length() == 0) {
      throw new Exception("Couldn't find any data!");
    }

    FastVector atts = new FastVector(2);
    atts.addElement(new Attribute("doc", (FastVector) null));
    atts.addElement(new Attribute("keyphrases", (FastVector) null));
    Instances data = new Instances("keyphrase_training_data", atts, 0);

    // Single instance: the document text, with no known keyphrases.
    double[] newInst = new double[2];
    newInst[0] = (double) data.attribute(0).addStringValue(document);
    newInst[1] = Instance.missingValue();
    data.add(new Instance(1.0, newInst));

    m_KEAFilter.input(data.instance(0));

    ke.setNumPhrases(numOfPhrases);

    // Slot i holds the instance whose rank value is i + 1.
    Instance[] topRankedInstances = new Instance[numOfPhrases];
    Instance inst;
    while ((inst = m_KEAFilter.output()) != null) {
      int index = (int) inst.value(m_KEAFilter.getRankIndex()) - 1;
      if (index < numOfPhrases) {
        topRankedInstances[index] = inst;
      }
    }

    // Keep only the ranks that were actually produced, preserving rank order.
    List<Instance> myInstances = new ArrayList<Instance>();
    for (int i = 0; i < numOfPhrases; i++) {
      if (topRankedInstances[i] != null) {
        myInstances.add(topRankedInstances[i]);
      }
    }

    return myInstances;
  }
  /**
   * Classifies a single message.
   *
   * <p>The message is turned into an instance via {@code makeInstance} against the string-free
   * structure of {@code instances}, passed through {@code filter}, and the filtered instance is
   * handed to {@code classifier}.
   *
   * @param message the text to classify
   * @return the value returned by {@code classifier.classifyInstance}; cast to {@code int} it is
   *     the index of the class label
   * @throws Exception if filtering or classification fails
   */
  public double classifyMessage(String message) throws Exception {

    // The original filter instance must be reused here.
    filter.input(makeInstance(message, instances.stringFreeStructure()));
    Instance filtered = filter.output();

    // The label text can be looked up with instances.classAttribute().value((int) result).
    return classifier.classifyInstance(filtered);
  }
예제 #4
0
  /**
   * Defines the structure of the instances this filter will receive.
   *
   * <p>This default implementation validates the structure with {@code testInputFormat}, stores
   * its string-free structure as the input format, discards any output format, replaces the
   * output queue with a fresh one, marks a new batch as starting and the first batch as not yet
   * done, and initialises the input locators. Overriders should call <code>
   * super.setInputFormat(Instances)</code>.
   *
   * @param instanceInfo an Instances object containing the input instance structure (any instances
   *     contained in the object are ignored - only the structure is required).
   * @return true if the outputFormat may be collected immediately; this implementation always
   *     returns false
   * @throws Exception if the inputFormat can't be set successfully
   */
  public boolean setInputFormat(Instances instanceInfo) throws Exception {

    testInputFormat(instanceInfo);

    // Only the header is kept; buffered instances in instanceInfo are not carried over.
    m_InputFormat = instanceInfo.stringFreeStructure();

    // Reset all per-batch state.
    m_FirstBatchDone = false;
    m_NewBatch = true;
    m_OutputQueue = new Queue();
    m_OutputFormat = null;

    initInputLocators(m_InputFormat, null);

    // The output format is cleared above, so it cannot be collected yet.
    final boolean outputFormatAvailable = false;
    return outputFormatAvailable;
  }
예제 #5
0
  /**
   * Discards every instance buffered in the input format dataset. Call this instead of
   * getInputFormat().delete().
   *
   * <p>When neither the string locator nor the relational locator reports any attribute indices,
   * the dataset is simply emptied in place. Otherwise the input format is replaced by its
   * string-free structure and both locators are rebuilt against it with their previous allowed
   * indices.
   */
  protected void flushInput() {

    if ((m_InputStringAtts.getAttributeIndices().length == 0)
        && (m_InputRelAtts.getAttributeIndices().length == 0)) {
      // Cheaper than building a new Instances(m_InputFormat, 0).
      m_InputFormat.delete();
      return;
    }

    // String/relational attributes present: start over from a fresh header.
    m_InputFormat = m_InputFormat.stringFreeStructure();
    m_InputStringAtts = new StringLocator(m_InputFormat, m_InputStringAtts.getAllowedIndices());
    m_InputRelAtts = new RelationalLocator(m_InputFormat, m_InputRelAtts.getAllowedIndices());
  }
예제 #6
0
  /**
   * Defines the structure of the instances this filter produces. Derived classes should call this
   * once they know their output format. The output queue is replaced with an empty one in every
   * case.
   *
   * <p>For a non-null argument the string-free structure is stored, the output locators are
   * initialised, and the stored format's relation is renamed to the original relation name,
   * followed by "-", this filter's class name and, if the filter is an {@code OptionHandler}, each
   * of its options trimmed and appended without separators.
   *
   * @param outputFormat the new output format, or null to clear it
   */
  protected void setOutputFormat(Instances outputFormat) {

    if (outputFormat == null) {
      m_OutputFormat = null;
    } else {
      m_OutputFormat = outputFormat.stringFreeStructure();
      initOutputLocators(m_OutputFormat, null);

      // Build the new relation name: <old name>-<filter class><options...>
      StringBuilder renamed = new StringBuilder();
      renamed.append(outputFormat.relationName());
      renamed.append("-");
      renamed.append(this.getClass().getName());
      if (this instanceof OptionHandler) {
        for (String option : ((OptionHandler) this).getOptions()) {
          renamed.append(option.trim());
        }
      }
      m_OutputFormat.setRelationName(renamed.toString());
    }

    m_OutputQueue = new Queue();
  }
예제 #7
0
  /**
   * Signify that this batch of input to the filter is finished. If the filter requires all
   * instances prior to filtering, output() may now be called to retrieve the filtered instances.
   * Any subsequent instances filtered should be filtered based on setting obtained from the first
   * batch (unless the inputFormat has been re-assigned or new options have been set). This default
   * implementation assumes all instance processing occurs during inputFormat() and input().
   *
   * <p>The buffered input is flushed, the new-batch and first-batch-done flags are set, and, if
   * the output queue is empty and the output has string or relational attributes, the output
   * format is replaced by its string-free structure with both output locators rebuilt against it.
   *
   * @return true if there are instances pending output
   * @throws NullPointerException if no input structure has been defined,
   * @throws Exception if there was a problem finishing the batch.
   */
  public boolean batchFinished() throws Exception {

    if (m_InputFormat == null) {
      throw new NullPointerException("No input instance format defined");
    }
    flushInput();
    m_NewBatch = true;
    m_FirstBatchDone = true;

    if (m_OutputQueue.empty()) {
      // Clear out references to old strings/relationals occasionally
      if ((m_OutputStringAtts.getAttributeIndices().length > 0)
          || (m_OutputRelAtts.getAttributeIndices().length > 0)) {
        m_OutputFormat = m_OutputFormat.stringFreeStructure();
        // Rebuild BOTH locators against the fresh header, mirroring what flushInput() does for
        // the input side, so neither keeps pointing at the replaced output format.
        m_OutputStringAtts =
            new StringLocator(m_OutputFormat, m_OutputStringAtts.getAllowedIndices());
        m_OutputRelAtts =
            new RelationalLocator(m_OutputFormat, m_OutputRelAtts.getAllowedIndices());
      }
    }

    return (numPendingOutput() != 0);
  }
  /**
   * Extracts keyphrases for every document named by the keys of {@code stems}.
   *
   * <p>For each key {@code k}, the files {@code <m_dirName>/k.txt} (document text) and {@code
   * <m_dirName>/k.key} (existing keyphrases, optional) are read; a file that cannot be read is
   * recorded as a missing value. The resulting instance is fed to {@code m_KEAFilter} and the
   * filter's output is sorted into rank slots, keeping at most {@code m_numPhrases}. If no {@code
   * .key} file exists yet, the extracted phrases are written to one (with stemmed phrase and
   * probability columns when {@code m_AdditionalInfo} is set). The number of phrases whose last
   * attribute equals "True" is collected per document, and the mean and standard deviation over
   * all documents with at least one extracted phrase are printed to standard error.
   *
   * @param stems table whose keys are the document file stems (as Strings) to process
   * @throws Exception if {@code stems} is empty, or if filtering or writing the output fails
   */
  public void extractKeyphrases(Hashtable stems) throws Exception {

    Vector stats = new Vector();

    // Check whether there is actually any data
    if (stems.size() == 0) {
      throw new Exception("Couldn't find any data!");
    }

    FastVector atts = new FastVector(2);
    atts.addElement(new Attribute("doc", (FastVector) null));
    atts.addElement(new Attribute("keyphrases", (FastVector) null));
    Instances data = new Instances("keyphrase_training_data", atts, 0);

    // Extract keyphrases
    Enumeration elem = stems.keys();
    while (elem.hasMoreElements()) {
      String str = (String) elem.nextElement();
      double[] newInst = new double[2];

      // Document text; unreadable documents become a missing value.
      try {
        File txt = new File(m_dirName + "/" + str + ".txt");
        newInst[0] = (double) data.attribute(0).addStringValue(readWholeFile(txt));
      } catch (Exception e) {
        if (m_debug) {
          System.err.println("Can't read document " + str + ".txt");
        }
        newInst[0] = Instance.missingValue();
      }

      // Existing keyphrases, if any; absent or unreadable files become a missing value.
      try {
        File key = new File(m_dirName + "/" + str + ".key");
        newInst[1] = (double) data.attribute(1).addStringValue(readWholeFile(key));
      } catch (Exception e) {
        if (m_debug) {
          System.err.println("No keyphrases for stem " + str + ".");
        }
        newInst[1] = Instance.missingValue();
      }

      data.add(new Instance(1.0, newInst));
      m_KEAFilter.input(data.instance(0));
      // Start the next document from an empty dataset with the same structure.
      data = data.stringFreeStructure();
      if (m_debug) {
        System.err.println("-- Document: " + str);
      }

      // Slot i holds the instance whose rank value is i + 1.
      Instance[] topRankedInstances = new Instance[m_numPhrases];
      Instance inst;
      while ((inst = m_KEAFilter.output()) != null) {
        int index = (int) inst.value(m_KEAFilter.getRankIndex()) - 1;
        if (index < m_numPhrases) {
          topRankedInstances[index] = inst;
        }
      }
      if (m_debug) {
        System.err.println("-- Keyphrases and feature values:");
      }

      FileOutputStream out = null;
      PrintWriter printer = null;
      try {
        // Only write a .key file when none exists, so existing keyphrase files are not replaced.
        File key = new File(m_dirName + "/" + str + ".key");
        if (!key.exists()) {
          out = new FileOutputStream(m_dirName + "/" + str + ".key");
          if (!m_encoding.equals("default")) {
            printer = new PrintWriter(new OutputStreamWriter(out, m_encoding));
          } else {
            printer = new PrintWriter(out);
          }
        }

        double numExtracted = 0, numCorrect = 0;
        for (int i = 0; i < m_numPhrases; i++) {
          if (topRankedInstances[i] != null) {
            if (!topRankedInstances[i].isMissing(topRankedInstances[i].numAttributes() - 1)) {
              numExtracted += 1.0;
            }
            if ((int) topRankedInstances[i].value(topRankedInstances[i].numAttributes() - 1)
                == topRankedInstances[i]
                    .attribute(topRankedInstances[i].numAttributes() - 1)
                    .indexOfValue("True")) {
              numCorrect += 1.0;
            }
            if (printer != null) {
              printer.print(
                  topRankedInstances[i].stringValue(m_KEAFilter.getUnstemmedPhraseIndex()));
              if (m_AdditionalInfo) {
                printer.print("\t");
                printer.print(
                    topRankedInstances[i].stringValue(m_KEAFilter.getStemmedPhraseIndex()));
                printer.print("\t");
                printer.print(
                    Utils.doubleToString(
                        topRankedInstances[i].value(m_KEAFilter.getProbabilityIndex()), 4));
              }
              printer.println();
            }
            if (m_debug) {
              System.err.println(topRankedInstances[i]);
            }
          }
        }
        if (numExtracted > 0) {
          if (m_debug) {
            System.err.println("-- " + numCorrect + " correct");
          }
          stats.addElement(new Double(numCorrect));
        }
      } finally {
        // Release the output file even when something above throws.
        if (printer != null) {
          printer.flush();
          printer.close();
        }
        if (out != null) {
          out.close();
        }
      }
    }

    double[] st = new double[stats.size()];
    for (int i = 0; i < stats.size(); i++) {
      st[i] = ((Double) stats.elementAt(i)).doubleValue();
    }
    double avg = Utils.mean(st);
    double stdDev = Math.sqrt(Utils.variance(st));
    System.err.println(
        "Avg. number of correct keyphrases: "
            + Utils.doubleToString(avg, 2)
            + " +/- "
            + Utils.doubleToString(stdDev, 2));
    System.err.println("Based on " + stats.size() + " documents");
    m_KEAFilter.batchFinished();
  }

  /**
   * Reads the complete content of a file, using {@code m_encoding} unless it is "default", and
   * always closes the underlying stream.
   *
   * @param file the file to read
   * @return the file content as a string
   * @throws Exception if the file cannot be opened, decoded or read
   */
  private String readWholeFile(File file) throws Exception {
    FileInputStream raw = new FileInputStream(file);
    Reader is = null;
    try {
      if (!m_encoding.equals("default")) {
        is = new BomStrippingInputStreamReader(raw, m_encoding);
      } else {
        is = new BomStrippingInputStreamReader(raw);
      }
      StringBuffer content = new StringBuffer();
      int c;
      while ((c = is.read()) != -1) {
        content.append((char) c);
      }
      return content.toString();
    } finally {
      // Closing the reader closes the stream; fall back to the raw stream if the reader was
      // never created.
      if (is != null) {
        is.close();
      } else {
        raw.close();
      }
    }
  }
예제 #9
0
  /**
   * Builds the model from the training data.
   *
   * <p>A new {@code MauiFilter} is created and configured from this object's settings. For each
   * name {@code f} in {@code fileNames}, the files {@code <inputDirectoryName>/f.txt} (document
   * text) and {@code <inputDirectoryName>/f.key} (topics) are read and fed to the filter as one
   * instance with the attributes "filename", "document" and "keyphrases"; a file that cannot be
   * read is reported on standard error and recorded as a missing value. After all documents are
   * supplied the batch is finished and the filter's output queue is drained.
   *
   * @param fileNames the file name stems (without extension) of the training documents
   * @throws Exception if {@code fileNames} is empty or the filter fails
   */
  public void buildModel(HashSet<String> fileNames) throws Exception {

    // Check whether there is actually any data
    if (fileNames.size() == 0) {
      throw new Exception("Couldn't find any data in " + inputDirectoryName);
    }

    System.err.println("-- Building the model... ");

    FastVector atts = new FastVector(3);
    atts.addElement(new Attribute("filename", (FastVector) null));
    atts.addElement(new Attribute("document", (FastVector) null));
    atts.addElement(new Attribute("keyphrases", (FastVector) null));
    Instances data = new Instances("keyphrase_training_data", atts, 0);

    // Build model
    mauiFilter = new MauiFilter();

    mauiFilter.setDebug(getDebug());
    mauiFilter.setMaxPhraseLength(getMaxPhraseLength());
    mauiFilter.setMinPhraseLength(getMinPhraseLength());
    mauiFilter.setMinNumOccur(getMinNumOccur());
    mauiFilter.setStemmer(getStemmer());
    mauiFilter.setDocumentLanguage(getDocumentLanguage());
    mauiFilter.setVocabularyName(getVocabularyName());
    mauiFilter.setVocabularyFormat(getVocabularyFormat());
    mauiFilter.setStopwords(getStopwords());

    if (wikipedia != null) {
      mauiFilter.setWikipedia(wikipedia);
    } else if (wikipediaServer.equals("localhost") && wikipediaDatabase.equals("database")) {
      // NOTE(review): wikipedia is null on this branch, so null is passed on - confirm intended.
      mauiFilter.setWikipedia(wikipedia);
    } else {
      mauiFilter.setWikipedia(
          wikipediaServer, wikipediaDatabase, cacheWikipediaData, wikipediaDataDirectory);
    }

    if (classifier != null) {
      mauiFilter.setClassifier(classifier);
    }

    mauiFilter.setInputFormat(data);

    // set features configurations
    mauiFilter.setBasicFeatures(useBasicFeatures);
    mauiFilter.setKeyphrasenessFeature(useKeyphrasenessFeature);
    mauiFilter.setFrequencyFeatures(useFrequencyFeatures);
    mauiFilter.setPositionsFeatures(usePositionsFeatures);
    mauiFilter.setLengthFeature(useLengthFeature);
    mauiFilter.setThesaurusFeatures(useNodeDegreeFeature);
    mauiFilter.setBasicWikipediaFeatures(useBasicWikipediaFeatures);
    mauiFilter.setAllWikipediaFeatures(useAllWikipediaFeatures);
    // NOTE(review): repeats the setThesaurusFeatures call above with the same argument; kept
    // because the setters' interactions are not visible here.
    mauiFilter.setThesaurusFeatures(useNodeDegreeFeature);

    // NOTE(review): unconditional, unlike the null-guarded call before setInputFormat.
    mauiFilter.setClassifier(classifier);

    mauiFilter.setContextSize(contextSize);
    mauiFilter.setMinKeyphraseness(minKeyphraseness);
    mauiFilter.setMinSenseProbability(minSenseProbability);

    if (!vocabularyName.equals("none") && !vocabularyName.equals("wikipedia")) {
      mauiFilter.loadThesaurus(getStemmer(), getStopwords());
    }

    System.err.println("-- Reading the input documents... ");

    for (String fileName : fileNames) {

      double[] newInst = new double[3];

      newInst[0] = (double) data.attribute(0).addStringValue(fileName);

      File documentTextFile = new File(inputDirectoryName + "/" + fileName + ".txt");
      File documentTopicsFile = new File(inputDirectoryName + "/" + fileName + ".key");

      // Adding the text of the document to the instance
      try {
        newInst[1] =
            (double) data.attribute(1).addStringValue(readFileContent(documentTextFile));
      } catch (Exception e) {
        System.err.println("Problem with reading " + documentTextFile);
        e.printStackTrace();
        newInst[1] = Instance.missingValue();
      }

      // Adding the topics to the instance
      try {
        newInst[2] =
            (double) data.attribute(2).addStringValue(readFileContent(documentTopicsFile));
      } catch (Exception e) {
        System.err.println("Problem with reading " + documentTopicsFile);
        e.printStackTrace();
        newInst[2] = Instance.missingValue();
      }

      data.add(new Instance(1.0, newInst));

      mauiFilter.input(data.instance(0));
      // Start the next document from an empty dataset with the same structure.
      data = data.stringFreeStructure();
    }
    mauiFilter.batchFinished();

    // Drain the filter's output queue; the emitted instances are not needed here.
    while ((mauiFilter.output()) != null) {}
  }

  /**
   * Reads the complete content of a file, using {@code documentEncoding} unless it is "default",
   * and always closes the underlying stream.
   *
   * @param file the file to read
   * @return the file content as a string
   * @throws Exception if the file cannot be opened, decoded or read
   */
  private String readFileContent(File file) throws Exception {
    FileInputStream raw = new FileInputStream(file);
    InputStreamReader is = null;
    try {
      if (!documentEncoding.equals("default")) {
        is = new InputStreamReader(raw, documentEncoding);
      } else {
        is = new InputStreamReader(raw);
      }
      StringBuffer content = new StringBuffer();
      int c;
      while ((c = is.read()) != -1) {
        content.append((char) c);
      }
      return content.toString();
    } finally {
      // Closing the reader closes the stream; fall back to the raw stream if the reader was
      // never created.
      if (is != null) {
        is.close();
      } else {
        raw.close();
      }
    }
  }