/** * Build the associator on the filtered data. * * @param data the training data * @throws Exception if the Associator could not be built successfully */ public void buildAssociations(Instances data) throws Exception { if (m_Associator == null) throw new Exception("No base associator has been set!"); // create copy and set class-index data = new Instances(data); if (getClassIndex() == 0) { data.setClassIndex(data.numAttributes() - 1); } else { data.setClassIndex(getClassIndex() - 1); } if (getClassIndex() != -1) { // remove instances with missing class data.deleteWithMissingClass(); } m_Filter.setInputFormat(data); // filter capabilities are checked here data = Filter.useFilter(data, m_Filter); // can associator handle the data? getAssociator().getCapabilities().testWithFail(data); m_FilteredInstances = data.stringFreeStructure(); m_Associator.buildAssociations(data); }
private List<Instance> myExtractKeyphrases(String document, int numOfPhrases) throws Exception { // Check whether there is actually any data // if (document.length() == 0 || document.equals("")) { throw new Exception("Couldn't find any data!"); } FastVector atts = new FastVector(3); atts.addElement(new Attribute("doc", (FastVector) null)); atts.addElement(new Attribute("keyphrases", (FastVector) null)); Instances data = new Instances("keyphrase_training_data", atts, 0); List<Instance> myInstances = new ArrayList<Instance>(); double[] newInst = new double[2]; newInst[0] = (double) data.attribute(0).addStringValue(document); newInst[1] = Instance.missingValue(); data.add(new Instance(1.0, newInst)); m_KEAFilter.input(data.instance(0)); data = data.stringFreeStructure(); ke.setNumPhrases(numOfPhrases); int numPhrases = numOfPhrases; // ke.getNumPhrases(); Instance[] topRankedInstances = new Instance[numPhrases]; Instance inst; // Iterating over all extracted keyphrases (inst) while ((inst = m_KEAFilter.output()) != null) { int index = (int) inst.value(m_KEAFilter.getRankIndex()) - 1; if (index < numPhrases) { topRankedInstances[index] = inst; } } double numExtracted = 0, numCorrect = 0; for (int i = 0; i < numPhrases; i++) { if (topRankedInstances[i] != null) { if (!topRankedInstances[i].isMissing(topRankedInstances[i].numAttributes() - 1)) { numExtracted += 1.0; } if ((int) topRankedInstances[i].value(topRankedInstances[i].numAttributes() - 1) == 1) { numCorrect += 1.0; } myInstances.add(topRankedInstances[i]); } } return myInstances; }
/** 分类过程 */ public double classifyMessage(String message) throws Exception { filter.input(makeInstance(message, instances.stringFreeStructure())); Instance filteredInstance = filter.output(); // 必须使用原来的filter double predicted = classifier.classifyInstance(filteredInstance); // (int)predicted是类标索引 // System.out.println("Message classified as : " // + instances.classAttribute().value((int) predicted)); return predicted; }
/**
 * Sets the format of the input instances.
 *
 * <p>A filter that can determine its output format before seeing any input should do so here.
 * This default implementation validates the format, stores its string-free structure, clears the
 * output format and the output queue, marks the start of a new batch, resets the
 * first-batch-done flag and initialises the input locators. Overriding methods should call
 * <code>super.setInputFormat(Instances)</code>.
 *
 * @param instanceInfo an Instances object containing the input instance structure (any instances
 *     contained in the object are ignored - only the structure is required).
 * @return true if the outputFormat may be collected immediately; this default implementation
 *     always returns false
 * @throws Exception if the inputFormat can't be set successfully
 */
public boolean setInputFormat(Instances instanceInfo) throws Exception {
  testInputFormat(instanceInfo);

  // Only the structure is kept.
  Instances structure = instanceInfo.stringFreeStructure();
  m_InputFormat = structure;

  // Reset batch state and drop any previously determined output.
  m_FirstBatchDone = false;
  m_NewBatch = true;
  m_OutputQueue = new Queue();
  m_OutputFormat = null;

  initInputLocators(structure, null);
  return false;
}
/** * This will remove all buffered instances from the inputformat dataset. Use this method rather * than getInputFormat().delete(); */ protected void flushInput() { if ((m_InputStringAtts.getAttributeIndices().length > 0) || (m_InputRelAtts.getAttributeIndices().length > 0)) { m_InputFormat = m_InputFormat.stringFreeStructure(); m_InputStringAtts = new StringLocator(m_InputFormat, m_InputStringAtts.getAllowedIndices()); m_InputRelAtts = new RelationalLocator(m_InputFormat, m_InputRelAtts.getAllowedIndices()); } else { // This more efficient than new Instances(m_InputFormat, 0); m_InputFormat.delete(); } }
/** * Sets the format of output instances. The derived class should use this method once it has * determined the outputformat. The output queue is cleared. * * @param outputFormat the new output format */ protected void setOutputFormat(Instances outputFormat) { if (outputFormat != null) { m_OutputFormat = outputFormat.stringFreeStructure(); initOutputLocators(m_OutputFormat, null); // Rename the relation String relationName = outputFormat.relationName() + "-" + this.getClass().getName(); if (this instanceof OptionHandler) { String[] options = ((OptionHandler) this).getOptions(); for (int i = 0; i < options.length; i++) { relationName += options[i].trim(); } } m_OutputFormat.setRelationName(relationName); } else { m_OutputFormat = null; } m_OutputQueue = new Queue(); }
/** * Signify that this batch of input to the filter is finished. If the filter requires all * instances prior to filtering, output() may now be called to retrieve the filtered instances. * Any subsequent instances filtered should be filtered based on setting obtained from the first * batch (unless the inputFormat has been re-assigned or new options have been set). This default * implementation assumes all instance processing occurs during inputFormat() and input(). * * @return true if there are instances pending output * @throws NullPointerException if no input structure has been defined, * @throws Exception if there was a problem finishing the batch. */ public boolean batchFinished() throws Exception { if (m_InputFormat == null) { throw new NullPointerException("No input instance format defined"); } flushInput(); m_NewBatch = true; m_FirstBatchDone = true; if (m_OutputQueue.empty()) { // Clear out references to old strings/relationals occasionally if ((m_OutputStringAtts.getAttributeIndices().length > 0) || (m_OutputRelAtts.getAttributeIndices().length > 0)) { m_OutputFormat = m_OutputFormat.stringFreeStructure(); m_OutputStringAtts = new StringLocator(m_OutputFormat, m_OutputStringAtts.getAllowedIndices()); } } return (numPendingOutput() != 0); }
/** Builds the model from the files */ public void extractKeyphrases(Hashtable stems) throws Exception { Vector stats = new Vector(); // Check whether there is actually any data if (stems.size() == 0) { throw new Exception("Couldn't find any data!"); } FastVector atts = new FastVector(2); atts.addElement(new Attribute("doc", (FastVector) null)); atts.addElement(new Attribute("keyphrases", (FastVector) null)); Instances data = new Instances("keyphrase_training_data", atts, 0); // Extract keyphrases Enumeration elem = stems.keys(); while (elem.hasMoreElements()) { String str = (String) elem.nextElement(); double[] newInst = new double[2]; try { File txt = new File(m_dirName + "/" + str + ".txt"); Reader is; if (!m_encoding.equals("default")) { is = new BomStrippingInputStreamReader(new FileInputStream(txt), m_encoding); } else { is = new BomStrippingInputStreamReader(new FileInputStream(txt)); } StringBuffer txtStr = new StringBuffer(); int c; while ((c = is.read()) != -1) { txtStr.append((char) c); } newInst[0] = (double) data.attribute(0).addStringValue(txtStr.toString()); } catch (Exception e) { if (m_debug) { System.err.println("Can't read document " + str + ".txt"); } newInst[0] = Instance.missingValue(); } try { File key = new File(m_dirName + "/" + str + ".key"); Reader is; if (!m_encoding.equals("default")) { is = new BomStrippingInputStreamReader(new FileInputStream(key), m_encoding); } else { is = new BomStrippingInputStreamReader(new FileInputStream(key)); } StringBuffer keyStr = new StringBuffer(); int c; while ((c = is.read()) != -1) { keyStr.append((char) c); } newInst[1] = (double) data.attribute(1).addStringValue(keyStr.toString()); } catch (Exception e) { if (m_debug) { System.err.println("No keyphrases for stem " + str + "."); } newInst[1] = Instance.missingValue(); } data.add(new Instance(1.0, newInst)); m_KEAFilter.input(data.instance(0)); data = data.stringFreeStructure(); if (m_debug) { System.err.println("-- Document: " + str); } Instance[] 
topRankedInstances = new Instance[m_numPhrases]; Instance inst; while ((inst = m_KEAFilter.output()) != null) { int index = (int) inst.value(m_KEAFilter.getRankIndex()) - 1; if (index < m_numPhrases) { topRankedInstances[index] = inst; } } if (m_debug) { System.err.println("-- Keyphrases and feature values:"); } FileOutputStream out = null; PrintWriter printer = null; File key = new File(m_dirName + "/" + str + ".key"); if (!key.exists()) { out = new FileOutputStream(m_dirName + "/" + str + ".key"); if (!m_encoding.equals("default")) { printer = new PrintWriter(new OutputStreamWriter(out, m_encoding)); } else { printer = new PrintWriter(out); } } double numExtracted = 0, numCorrect = 0; for (int i = 0; i < m_numPhrases; i++) { if (topRankedInstances[i] != null) { if (!topRankedInstances[i].isMissing(topRankedInstances[i].numAttributes() - 1)) { numExtracted += 1.0; } if ((int) topRankedInstances[i].value(topRankedInstances[i].numAttributes() - 1) == topRankedInstances[i] .attribute(topRankedInstances[i].numAttributes() - 1) .indexOfValue("True")) { numCorrect += 1.0; } if (printer != null) { printer.print(topRankedInstances[i].stringValue(m_KEAFilter.getUnstemmedPhraseIndex())); if (m_AdditionalInfo) { printer.print("\t"); printer.print(topRankedInstances[i].stringValue(m_KEAFilter.getStemmedPhraseIndex())); printer.print("\t"); printer.print( Utils.doubleToString( topRankedInstances[i].value(m_KEAFilter.getProbabilityIndex()), 4)); } printer.println(); } if (m_debug) { System.err.println(topRankedInstances[i]); } } } if (numExtracted > 0) { if (m_debug) { System.err.println("-- " + numCorrect + " correct"); } stats.addElement(new Double(numCorrect)); } if (printer != null) { printer.flush(); printer.close(); out.close(); } } double[] st = new double[stats.size()]; for (int i = 0; i < stats.size(); i++) { st[i] = ((Double) stats.elementAt(i)).doubleValue(); } double avg = Utils.mean(st); double stdDev = Math.sqrt(Utils.variance(st)); System.err.println( "Avg. 
number of correct keyphrases: " + Utils.doubleToString(avg, 2) + " +/- " + Utils.doubleToString(stdDev, 2)); System.err.println("Based on " + stats.size() + " documents"); m_KEAFilter.batchFinished(); }
/** Builds the model from the training data */ public void buildModel(HashSet<String> fileNames) throws Exception { // Check whether there is actually any data if (fileNames.size() == 0) { throw new Exception("Couldn't find any data in " + inputDirectoryName); } System.err.println("-- Building the model... "); FastVector atts = new FastVector(3); atts.addElement(new Attribute("filename", (FastVector) null)); atts.addElement(new Attribute("document", (FastVector) null)); atts.addElement(new Attribute("keyphrases", (FastVector) null)); Instances data = new Instances("keyphrase_training_data", atts, 0); // Build model mauiFilter = new MauiFilter(); mauiFilter.setDebug(getDebug()); mauiFilter.setMaxPhraseLength(getMaxPhraseLength()); mauiFilter.setMinPhraseLength(getMinPhraseLength()); mauiFilter.setMinNumOccur(getMinNumOccur()); mauiFilter.setStemmer(getStemmer()); mauiFilter.setDocumentLanguage(getDocumentLanguage()); mauiFilter.setVocabularyName(getVocabularyName()); mauiFilter.setVocabularyFormat(getVocabularyFormat()); mauiFilter.setStopwords(getStopwords()); if (wikipedia != null) { mauiFilter.setWikipedia(wikipedia); } else if (wikipediaServer.equals("localhost") && wikipediaDatabase.equals("database")) { mauiFilter.setWikipedia(wikipedia); } else { mauiFilter.setWikipedia( wikipediaServer, wikipediaDatabase, cacheWikipediaData, wikipediaDataDirectory); } if (classifier != null) { mauiFilter.setClassifier(classifier); } mauiFilter.setInputFormat(data); // set features configurations mauiFilter.setBasicFeatures(useBasicFeatures); mauiFilter.setKeyphrasenessFeature(useKeyphrasenessFeature); mauiFilter.setFrequencyFeatures(useFrequencyFeatures); mauiFilter.setPositionsFeatures(usePositionsFeatures); mauiFilter.setLengthFeature(useLengthFeature); mauiFilter.setThesaurusFeatures(useNodeDegreeFeature); mauiFilter.setBasicWikipediaFeatures(useBasicWikipediaFeatures); mauiFilter.setAllWikipediaFeatures(useAllWikipediaFeatures); 
mauiFilter.setThesaurusFeatures(useNodeDegreeFeature); mauiFilter.setClassifier(classifier); mauiFilter.setContextSize(contextSize); mauiFilter.setMinKeyphraseness(minKeyphraseness); mauiFilter.setMinSenseProbability(minSenseProbability); if (!vocabularyName.equals("none") && !vocabularyName.equals("wikipedia")) { mauiFilter.loadThesaurus(getStemmer(), getStopwords()); } System.err.println("-- Reading the input documents... "); for (String fileName : fileNames) { double[] newInst = new double[3]; newInst[0] = (double) data.attribute(0).addStringValue(fileName); ; File documentTextFile = new File(inputDirectoryName + "/" + fileName + ".txt"); File documentTopicsFile = new File(inputDirectoryName + "/" + fileName + ".key"); try { InputStreamReader is; if (!documentEncoding.equals("default")) { is = new InputStreamReader(new FileInputStream(documentTextFile), documentEncoding); } else { is = new InputStreamReader(new FileInputStream(documentTextFile)); } // Reading the file content StringBuffer txtStr = new StringBuffer(); int c; while ((c = is.read()) != -1) { txtStr.append((char) c); } is.close(); // Adding the text of the document to the instance newInst[1] = (double) data.attribute(1).addStringValue(txtStr.toString()); } catch (Exception e) { System.err.println("Problem with reading " + documentTextFile); e.printStackTrace(); newInst[1] = Instance.missingValue(); } try { InputStreamReader is; if (!documentEncoding.equals("default")) { is = new InputStreamReader(new FileInputStream(documentTopicsFile), documentEncoding); } else { is = new InputStreamReader(new FileInputStream(documentTopicsFile)); } // Reading the content of the keyphrase file StringBuffer keyStr = new StringBuffer(); int c; while ((c = is.read()) != -1) { keyStr.append((char) c); } // Adding the topics to the file newInst[2] = (double) data.attribute(2).addStringValue(keyStr.toString()); } catch (Exception e) { System.err.println("Problem with reading " + documentTopicsFile); e.printStackTrace(); 
newInst[2] = Instance.missingValue(); } data.add(new Instance(1.0, newInst)); mauiFilter.input(data.instance(0)); data = data.stringFreeStructure(); } mauiFilter.batchFinished(); while ((mauiFilter.output()) != null) {} ; }