/** Returns a String summarizing precision for a label that will print nicely. */
public String getPrecisionDescription(int numDigits, L label) {
  NumberFormat nf = NumberFormat.getNumberInstance();
  nf.setMaximumFractionDigits(numDigits);
  Triple<Double, Integer, Integer> prec = getPrecisionInfo(label);
  return nf.format(prec.first()) + " (" + prec.second() + "/" + (prec.second() + prec.third()) + ")";
}

/** Returns a String summarizing recall for a label that will print nicely. */
public String getRecallDescription(int numDigits, L label) {
  NumberFormat nf = NumberFormat.getNumberInstance();
  nf.setMaximumFractionDigits(numDigits);
  Triple<Double, Integer, Integer> recall = getRecallInfo(label);
  return nf.format(recall.first()) + " (" + recall.second() + "/" + (recall.second() + recall.third()) + ")";
}

/** Returns a String summarizing overall accuracy that will print nicely. */
public String getAccuracyDescription(int numDigits) {
  NumberFormat nf = NumberFormat.getNumberInstance();
  nf.setMaximumFractionDigits(numDigits);
  Triple<Double, Integer, Integer> accu = getAccuracyInfo();
  return nf.format(accu.first()) + " (" + accu.second() + "/" + (accu.second() + accu.third()) + ")";
}
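/*
 * Minimal usage sketch for the three description getters above. The enclosing
 * class is not shown in this snippet; "stats" is assumed here to be any object
 * exposing these getters with String labels (in Stanford NLP they live on a
 * precision/recall stats class, assumed below as MultiClassPrecisionRecallStats).
 * Each getter formats its Triple as "value (count/total)", e.g. "0.95 (95/100)".
 */
static void printScores(MultiClassPrecisionRecallStats<String> stats, Iterable<String> labels) {
  System.out.println("accuracy: " + stats.getAccuracyDescription(2));
  for (String label : labels) {
    System.out.println(label + " precision: " + stats.getPrecisionDescription(2, label)
        + ", recall: " + stats.getRecallDescription(2, label));
  }
}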
public ArrayList<String> getNamedEntity(String sentence) {
  ArrayList<String> entities = new ArrayList<>();
  List<Triple<String, Integer, Integer>> name2index = classifier.classifyToCharacterOffsets(sentence);
  for (Triple<String, Integer, Integer> name : name2index) {
    entities.add(sentence.substring(name.second(), name.third()));
  }
  return entities;
}
public Map<String, NamedEntity> findNamedEntities(String sentence) {
  final Map<String, NamedEntity> namedEntities = new HashMap<>();
  final List<Triple<String, Integer, Integer>> nerSubstrings = findNerSubstrings(sentence);
  for (final Triple<String, Integer, Integer> substring : nerSubstrings) {
    namedEntities.put(
        sentence.substring(substring.second(), substring.third()),
        NamedEntity.getNamedEntity(substring.first()));
  }
  return namedEntities;
}
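/*
 * Usage sketch for the two extractors above. The "finder" instance and its
 * EntityFinder type are hypothetical; the object is assumed to hold an
 * initialized classifier field (see analyzeThisFile below for loading one).
 * Note a design consequence of keying the map by surface string: repeated
 * mentions of the same string collapse to a single entry in findNamedEntities.
 */
static void demoEntityExtraction(EntityFinder finder) {
  String sentence = "Jim bought 300 shares of Acme Corp. in 2006.";
  for (Map.Entry<String, NamedEntity> entry : finder.findNamedEntities(sentence).entrySet()) {
    System.out.println(entry.getKey() + " -> " + entry.getValue());
  }
}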
public static Set<String> analyzeThisFile(String path) {
  String serializedClassifier = "classifiers/english.conll.4class.distsim.crf.ser.gz";
  Set<String> annotationsNE = new HashSet<String>();
  // try-with-resources so the reader is closed even if classification fails
  try (BufferedReader br = new BufferedReader(new FileReader(path))) {
    AbstractSequenceClassifier<CoreLabel> classifier = CRFClassifier.getClassifier(serializedClassifier);
    String line;
    while ((line = br.readLine()) != null) {
      List<Triple<String, Integer, Integer>> triples = classifier.classifyToCharacterOffsets(line);
      for (Triple<String, Integer, Integer> trip : triples) {
        annotationsNE.add(line.substring(trip.second(), trip.third()));
      }
    }
    System.out.println("Named entities found:");
    for (String ne : annotationsNE) {
      System.out.println(ne);
    }
  } catch (IOException | ClassCastException | ClassNotFoundException e) {
    e.printStackTrace();
  }
  return annotationsNE;
}
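/*
 * Usage sketch for analyzeThisFile; the input path is an example, not a file
 * shipped with this code. Because the method classifies one line at a time,
 * any entity spanning a line break will be missed.
 */
public static void main(String[] args) {
  Set<String> entities = analyzeThisFile(args.length > 0 ? args[0] : "data/sample.txt");
  System.out.println(entities.size() + " distinct entity strings extracted");
}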
private String getConllEvalString(List<L> orderedLabels, boolean ignoreNegLabel) {
  StringBuilder sb = new StringBuilder();
  int correctPhrases = getCorrect() - getCorrect(negLabel);
  Triple<Double, Integer, Integer> accuracyInfo = getAccuracyInfo();
  int totalCount = accuracyInfo.second() + accuracyInfo.third();
  sb.append("processed " + totalCount + " tokens with " + getRelevant() + " phrases; ");
  sb.append("found: " + getRetrieved() + " phrases; correct: " + correctPhrases + "\n");
  Formatter formatter = new Formatter(sb, Locale.US);
  formatter.format("accuracy: %6.2f%%; ", accuracyInfo.first() * 100);
  formatter.format("precision: %6.2f%%; ", getPrecision() * 100);
  formatter.format("recall: %6.2f%%; ", getRecall() * 100);
  formatter.format("FB1: %6.2f\n", getFMeasure() * 100);
  for (L label : orderedLabels) {
    if (ignoreNegLabel && label.equals(negLabel)) {
      continue;
    }
    formatter.format("%17s: ", label);
    formatter.format("precision: %6.2f%%; ", getPrecision(label) * 100);
    formatter.format("recall: %6.2f%%; ", getRecall(label) * 100);
    formatter.format("FB1: %6.2f %d\n", getFMeasure(label) * 100, getRetrieved(label));
  }
  return sb.toString();
}
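/*
 * Shape of the conlleval-style report built above, reconstructed from the
 * format strings (all numbers below are illustrative placeholders only):
 *
 *   processed 4567 tokens with 890 phrases; found: 905 phrases; correct: 800
 *   accuracy:  95.20%; precision:  88.40%; recall:  89.89%; FB1:  89.14
 *                 PER: precision:  91.00%; recall:  90.00%; FB1:  90.50 300
 *                 LOC: precision:  86.00%; recall:  89.00%; FB1:  87.47 290
 */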
public static void main(String[] args) throws Exception {

  // String serializedClassifier = "classifiers/english.all.3class.distsim.crf.ser.gz";
  String serializedClassifier = "classifiers/english.muc.7class.distsim.crf.ser.gz";

  if (args.length > 0) {
    serializedClassifier = args[0];
  }

  AbstractSequenceClassifier<CoreLabel> classifier = CRFClassifier.getClassifier(serializedClassifier);

  /* For either a file to annotate or for the hardcoded text example, this
     demo file shows several ways to process the input, for teaching purposes. */

  if (args.length > 1) {

    /* For the file, it shows (1) how to run NER on a String, (2) how to get
       the entities in the String with character offsets, and (3) how to run
       NER on a whole file (without loading it into a String). */

    String fileContents = IOUtils.slurpFile(args[1]);
    List<List<CoreLabel>> out = classifier.classify(fileContents);
    for (List<CoreLabel> sentence : out) {
      for (CoreLabel word : sentence) {
        System.out.print(word.word() + '/' + word.get(CoreAnnotations.AnswerAnnotation.class) + ' ');
      }
      System.out.println();
    }

    System.out.println("---");
    out = classifier.classifyFile(args[1]);
    for (List<CoreLabel> sentence : out) {
      for (CoreLabel word : sentence) {
        System.out.print(word.word() + '/' + word.get(CoreAnnotations.AnswerAnnotation.class) + ' ');
      }
      System.out.println();
    }

    System.out.println("---");
    List<Triple<String, Integer, Integer>> list = classifier.classifyToCharacterOffsets(fileContents);
    for (Triple<String, Integer, Integer> item : list) {
      // print the entity label and the span of text it covers
      System.out.println(item.first() + ": " + fileContents.substring(item.second(), item.third()));
    }

    System.out.println("---");
    System.out.println("Ten best entity labelings");
    DocumentReaderAndWriter<CoreLabel> readerAndWriter = classifier.makePlainTextReaderAndWriter();
    classifier.classifyAndWriteAnswersKBest(args[1], 10, readerAndWriter);

    System.out.println("---");
    System.out.println("Per-token marginalized probabilities");
    classifier.printProbs(args[1], readerAndWriter);

    // -- This code prints out the first order (token pair) clique probabilities.
    // -- But that output is a bit overwhelming, so we leave it commented out by default.
    // System.out.println("---");
    // System.out.println("First Order Clique Probabilities");
    // ((CRFClassifier) classifier).printFirstOrderProbs(args[1], readerAndWriter);

  } else {

    /* For the hard-coded String, it shows how to run it on a single sentence,
       and how to do this and produce several formats, including slash tags and
       an inline XML output format. It also shows the full contents of the
       {@code CoreLabel}s that are constructed by the classifier. And it shows
       getting out the probabilities of different assignments and an n-best
       list of classifications with probabilities. */

    String[] example = {
      "Good afternoon Rajat Raina, how are you today? I go to Washington DC on September 19. And Tomorrow.",
      "I go to school at Stanford University, which is located in California."
    };

    for (String str : example) {
      System.out.println(classifier.classifyToString(str));
    }
    System.out.println("---");

    // Sentence-by-sentence. This one puts in spaces and newlines between tokens,
    // so just print, not println.
    for (String str : example) {
      System.out.print(classifier.classifyToString(str, "slashTags", false));
    }
    System.out.println("---");

    // Print entities + classes + remaining text in the text.
    // This one is best for dealing with the output as a TSV (tab-separated column) file.
    // The first column gives entities, the second their classes, and the third the
    // remaining text in a document.
    for (String str : example) {
      System.out.print(classifier.classifyToString(str, "tabbedEntities", false));
    }
    System.out.println("---");

    for (String str : example) {
      System.out.println(classifier.classifyWithInlineXML(str));
    }
    System.out.println("---");

    for (String str : example) {
      System.out.println(classifier.classifyToString(str, "xml", true));
    }
    System.out.println("---");

    for (String str : example) {
      System.out.print(classifier.classifyToString(str, "tsv", false));
    }
    System.out.println("---");

    // This gets out entities with character offsets.
    System.out.println("character offsets");
    int j = 0;
    for (String str : example) {
      j++;
      List<Triple<String, Integer, Integer>> triples = classifier.classifyToCharacterOffsets(str);
      for (Triple<String, Integer, Integer> trip : triples) {
        System.out.printf("%s over character offsets [%d, %d) in sentence %d.%n",
            trip.first(), trip.second(), trip.third(), j);
      }
    }
    System.out.println("---");

    // This prints out all the details of what is stored for each token.
    int i = 0;
    for (String str : example) {
      for (List<CoreLabel> lcl : classifier.classify(str)) {
        for (CoreLabel cl : lcl) {
          System.out.print(i++ + ": ");
          System.out.println(cl.toShorterString());
        }
      }
    }
    System.out.println("---");
  }
}
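// How this demo is typically run (the classpath entries are examples; point
// them at your Stanford NER distribution):
//
//   java -cp "stanford-ner.jar:lib/*" NERDemo
//       -> classifies the hard-coded example sentences
//   java -cp "stanford-ner.jar:lib/*" NERDemo classifiers/english.muc.7class.distsim.crf.ser.gz myFile.txt
//       -> classifies myFile.txt with the given serialized classifier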
public void train(Collection<Pair<Document, List<Entity>>> trainingData) {
  startTrack("Training");
  // --Variables
  RVFDataset<Boolean, Feature> dataset = new RVFDataset<Boolean, Feature>();
  LinearClassifierFactory<Boolean, Feature> fact = new LinearClassifierFactory<Boolean, Feature>();
  // --Feature Extraction
  startTrack("Feature Extraction");
  for (Pair<Document, List<Entity>> datum : trainingData) {
    // (document variables)
    Document doc = datum.getFirst();
    List<Entity> goldClusters = datum.getSecond();
    List<Mention> mentions = doc.getMentions();
    Map<Mention, Entity> goldEntities = Entity.mentionToEntityMap(goldClusters);
    startTrack("Document " + doc.id);
    // (for each mention...)
    for (int i = 0; i < mentions.size(); i++) {
      // (get the mention and its cluster)
      Mention onPrix = mentions.get(i);
      Entity source = goldEntities.get(onPrix);
      if (source == null) {
        throw new IllegalArgumentException("Mention has no gold entity: " + onPrix);
      }
      // (for each previous mention...)
      int oldSize = dataset.size();
      for (int j = i - 1; j >= 0; j--) {
        // (get previous mention and its cluster)
        Mention cand = mentions.get(j);
        Entity target = goldEntities.get(cand);
        if (target == null) {
          throw new IllegalArgumentException("Mention has no gold entity: " + cand);
        }
        // (extract features)
        Counter<Feature> feats =
            extractor.extractFeatures(Pair.make(onPrix, cand.markCoreferent(target)));
        // (add datum)
        dataset.add(new RVFDatum<Boolean, Feature>(feats, target == source));
        // (stop once the gold coreferent antecedent has been reached)
        if (target == source) {
          break;
        }
      }
      // logf("Mention %s (%d datums)", onPrix.toString(), dataset.size() - oldSize);
    }
    endTrack("Document " + doc.id);
  }
  endTrack("Feature Extraction");
  // --Train Classifier
  startTrack("Minimizer");
  this.classifier = fact.trainClassifier(dataset);
  endTrack("Minimizer");
  // --Dump Weights
  startTrack("Features");
  // (get labels to print)
  Set<Boolean> labels = new HashSet<Boolean>();
  labels.add(true);
  // (print features)
  for (Triple<Feature, Boolean, Double> featureInfo :
      this.classifier.getTopFeatures(labels, 0.0, true, 100, true)) {
    Feature feature = featureInfo.first();
    Boolean label = featureInfo.second();
    Double magnitude = featureInfo.third();
    // log(FORCE, new DecimalFormat("0.000").format(magnitude) + " [" + label + "] " + feature);
  }
  endTrack("Features");
  endTrack("Training");
}
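/*
 * Note on the datum-generation loop above: for each mention, candidate
 * antecedents are scanned right to left, emitting a negative datum for each
 * non-coreferent candidate until the first mention in the same gold entity is
 * reached, which becomes the single positive datum. This matches the classic
 * closest-first pairwise training scheme for coreference (in the style of
 * Soon et al., 2001).
 */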