Ejemplo n.º 1
0
  // from TestKea
  /**
   * Configures the {@code ke} extractor with the given model plus a fixed set of options:
   * no controlled vocabulary, UTF-8 input, English documents, Porter stemming, English
   * stopwords, five keyphrases and no global dictionaries.
   *
   * @param model name of the model, given as the path to the model
   */
  private void setOptions(String model) {
    // Fixed settings, gathered here so they can be adjusted in one place.
    // Vocabulary: file name (without extension) of a vocabulary stored in VOCABULARIES,
    // e.g. "agrovoc", or "none" for free keyphrase extraction.
    final String vocabularyName = "none";
    // Character encoding of the document.
    final String documentEncoding = "UTF-8";
    // Document language: "es" for Spanish, "fr" for French, or any other language
    // specified in the "skos" vocabulary.
    final String languageCode = "en";
    // How many keyphrases to extract.
    final int phraseCount = 5;
    // Switch to true to compute global dictionaries from the test collection.
    final boolean useGlobalDictionaries = false;

    // Required: the model and the vocabulary.
    ke.setModelName(model);
    ke.setVocabulary(vocabularyName);

    // Optional: everything below overrides a default.
    ke.setEncoding(documentEncoding);
    ke.setDocumentLanguage(languageCode);

    // Change the stemmer for non-English documents or to alter the results
    // (NoStemmer gave better results for Spanish and French).
    ke.setStemmer(new PorterStemmer());

    ke.setStopwords(new StopwordsEnglish());

    ke.setNumPhrases(phraseCount);
    ke.setBuildGlobal(useGlobalDictionaries);
  }
Ejemplo n.º 2
0
  /**
   * Extracts the top-ranked keyphrases from a single document.
   *
   * <p>The text is wrapped in a one-instance dataset (string attribute "doc", missing value for
   * "keyphrases") and pushed into {@code m_KEAFilter}. Every instance the filter then outputs
   * is placed by its rank (rank 1 first); instances ranked outside {@code 1..numOfPhrases}
   * are discarded.
   *
   * <p>Side effect: {@code ke.setNumPhrases(numOfPhrases)} is called.
   *
   * @param document the document text; must not be {@code null} or empty
   * @param numOfPhrases maximum number of keyphrases to return; must not be negative
   * @return the ranked instances in ascending rank order, at most {@code numOfPhrases} entries;
   *     ranks for which the filter produced no instance are skipped
   * @throws IllegalArgumentException if {@code numOfPhrases} is negative
   * @throws Exception if {@code document} is {@code null} or empty, or if the filter fails
   */
  private List<Instance> myExtractKeyphrases(String document, int numOfPhrases) throws Exception {

    // Check whether there is actually any data (null is treated like an empty document).
    if (document == null || document.length() == 0) {
      throw new Exception("Couldn't find any data!");
    }
    // Fail fast, before the filter or the extractor settings are touched.
    if (numOfPhrases < 0) {
      throw new IllegalArgumentException("numOfPhrases must not be negative: " + numOfPhrases);
    }

    // Dataset structure: two string attributes, the document text and its keyphrases.
    FastVector atts = new FastVector(2);
    atts.addElement(new Attribute("doc", (FastVector) null));
    atts.addElement(new Attribute("keyphrases", (FastVector) null));
    Instances data = new Instances("keyphrase_training_data", atts, 0);

    // One instance: the document text, with the keyphrases left missing.
    double[] newInst = new double[2];
    newInst[0] = (double) data.attribute(0).addStringValue(document);
    newInst[1] = Instance.missingValue();
    data.add(new Instance(1.0, newInst));

    m_KEAFilter.input(data.instance(0));

    ke.setNumPhrases(numOfPhrases);

    // Slot i holds the instance whose rank is i + 1.
    Instance[] topRankedInstances = new Instance[numOfPhrases];
    int rankIndex = m_KEAFilter.getRankIndex();
    Instance inst;

    // Drain everything the filter produced, keeping only the requested ranks.
    while ((inst = m_KEAFilter.output()) != null) {
      int index = (int) inst.value(rankIndex) - 1;

      // The lower bound matters: a rank below 1 would index before the start of the array.
      if (index >= 0 && index < numOfPhrases) {
        topRankedInstances[index] = inst;
      }
    }

    // Collect the filled slots in rank order.
    List<Instance> myInstances = new ArrayList<Instance>(numOfPhrases);
    for (Instance ranked : topRankedInstances) {
      if (ranked != null) {
        myInstances.add(ranked);
      }
    }

    return myInstances;
  }