// from TestKea private void setOptions(String model) { // Name of the model -- give the path to the model ke.setModelName(model); // Name of the vocabulary -- name of the file (without extension) that is stored in // VOCABULARIES // or "none" if no Vocabulary is used (free keyphrase extraction). // ke.setVocabulary("agrovoc"); ke.setVocabulary("none"); // Optional arguments if you want to change the defaults // Encoding of the document ke.setEncoding("UTF-8"); // Language of the document -- use "es" for Spanish, "fr" for French // or other languages as specified in your "skos" vocabulary ke.setDocumentLanguage("en"); // es for Spanish, fr for French // Stemmer -- adjust if you use a different language than English or want to alterate results // (We have obtained better results for Spanish and French with NoStemmer) ke.setStemmer(new PorterStemmer()); // Stopwords ke.setStopwords(new StopwordsEnglish()); // Number of Keyphrases to extract ke.setNumPhrases(5); // Set to true, if you want to compute global dictionaries from the test collection ke.setBuildGlobal(false); }
private List<Instance> myExtractKeyphrases(String document, int numOfPhrases) throws Exception { // Check whether there is actually any data // if (document.length() == 0 || document.equals("")) { throw new Exception("Couldn't find any data!"); } FastVector atts = new FastVector(3); atts.addElement(new Attribute("doc", (FastVector) null)); atts.addElement(new Attribute("keyphrases", (FastVector) null)); Instances data = new Instances("keyphrase_training_data", atts, 0); List<Instance> myInstances = new ArrayList<Instance>(); double[] newInst = new double[2]; newInst[0] = (double) data.attribute(0).addStringValue(document); newInst[1] = Instance.missingValue(); data.add(new Instance(1.0, newInst)); m_KEAFilter.input(data.instance(0)); data = data.stringFreeStructure(); ke.setNumPhrases(numOfPhrases); int numPhrases = numOfPhrases; // ke.getNumPhrases(); Instance[] topRankedInstances = new Instance[numPhrases]; Instance inst; // Iterating over all extracted keyphrases (inst) while ((inst = m_KEAFilter.output()) != null) { int index = (int) inst.value(m_KEAFilter.getRankIndex()) - 1; if (index < numPhrases) { topRankedInstances[index] = inst; } } double numExtracted = 0, numCorrect = 0; for (int i = 0; i < numPhrases; i++) { if (topRankedInstances[i] != null) { if (!topRankedInstances[i].isMissing(topRankedInstances[i].numAttributes() - 1)) { numExtracted += 1.0; } if ((int) topRankedInstances[i].value(topRankedInstances[i].numAttributes() - 1) == 1) { numCorrect += 1.0; } myInstances.add(topRankedInstances[i]); } } return myInstances; }