public ArrayList<String> getNamedEntity(String sentence) {
  ArrayList<String> entities = new ArrayList<>();
  List<Triple<String, Integer, Integer>> name2index =
      classifier.classifyToCharacterOffsets(sentence);
  for (Triple<String, Integer, Integer> name : name2index) {
    entities.add(sentence.substring(name.second(), name.third()));
  }
  return entities;
}
public Map<String, NamedEntity> findNamedEntities(String sentence) {
  final Map<String, NamedEntity> namedEntities = new HashMap<>();
  final List<Triple<String, Integer, Integer>> nerSubstrings = findNerSubstrings(sentence);
  for (final Triple<String, Integer, Integer> substring : nerSubstrings) {
    namedEntities.put(
        sentence.substring(substring.second(), substring.third()),
        NamedEntity.getNamedEntity(substring.first()));
  }
  return namedEntities;
}
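A brief usage sketch for the helper above. The sample sentence and the printing loop are illustrative assumptions; only findNamedEntities and the NamedEntity type come from the snippet, and the enclosing object is assumed to hold the classifier that backs findNerSubstrings().

// Hypothetical call site (illustrative only, not part of the original source).
Map<String, NamedEntity> found = findNamedEntities("Barack Obama visited Paris in 2015.");
for (Map.Entry<String, NamedEntity> entry : found.entrySet()) {
  // Key: the entity's surface text; value: its NamedEntity type.
  System.out.println(entry.getKey() + " -> " + entry.getValue());
}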
public String getRecallDescription(int numDigits, L label) {
  NumberFormat nf = NumberFormat.getNumberInstance();
  nf.setMaximumFractionDigits(numDigits);
  Triple<Double, Integer, Integer> recall = getRecallInfo(label);
  return nf.format(recall.first()) + " ("
      + recall.second() + "/" + (recall.second() + recall.third()) + ")";
}
/** Returns a String summarizing overall accuracy that will print nicely. */
public String getAccuracyDescription(int numDigits) {
  NumberFormat nf = NumberFormat.getNumberInstance();
  nf.setMaximumFractionDigits(numDigits);
  Triple<Double, Integer, Integer> accu = getAccuracyInfo();
  return nf.format(accu.first()) + " ("
      + accu.second() + "/" + (accu.second() + accu.third()) + ")";
}
public String getPrecisionDescription(int numDigits, L label) {
  NumberFormat nf = NumberFormat.getNumberInstance();
  nf.setMaximumFractionDigits(numDigits);
  Triple<Double, Integer, Integer> prec = getPrecisionInfo(label);
  return nf.format(prec.first()) + " ("
      + prec.second() + "/" + (prec.second() + prec.third()) + ")";
}
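A small, self-contained illustration of the Triple bookkeeping used by the three description methods above. The specific numbers and the (ratio, correct count, error count) reading of first()/second()/third() are assumptions inferred from the formatting code, not stated in the original source.

// Illustrative only: a Triple shaped like the one getPrecisionInfo(label) appears to return.
// Assumes: import java.text.NumberFormat; import edu.stanford.nlp.util.Triple;
Triple<Double, Integer, Integer> prec = new Triple<>(0.875, 7, 1);
NumberFormat nf = NumberFormat.getNumberInstance();
nf.setMaximumFractionDigits(2);
// Prints "0.88 (7/8)", the same shape produced by the description methods above.
System.out.println(nf.format(prec.first()) + " ("
    + prec.second() + "/" + (prec.second() + prec.third()) + ")");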
public static Set<String> analyzeThisFile(String path) {
  String serializedClassifier = "classifiers/english.conll.4class.distsim.crf.ser.gz";
  Set<String> annotationsNE = new HashSet<String>();
  try (BufferedReader br = new BufferedReader(new FileReader(path))) {
    AbstractSequenceClassifier<CoreLabel> classifier =
        CRFClassifier.getClassifier(serializedClassifier);
    String line;
    while ((line = br.readLine()) != null) {
      List<Triple<String, Integer, Integer>> triples = classifier.classifyToCharacterOffsets(line);
      for (Triple<String, Integer, Integer> trip : triples) {
        annotationsNE.add(line.substring(trip.second(), trip.third()));
      }
    }
    System.out.println("Named entities found:");
    for (String ne : annotationsNE) {
      System.out.println(ne);
    }
  } catch (IOException | ClassCastException | ClassNotFoundException e) {
    e.printStackTrace();
  }
  return annotationsNE;
}
private String getConllEvalString(List<L> orderedLabels, boolean ignoreNegLabel) {
  StringBuilder sb = new StringBuilder();
  int correctPhrases = getCorrect() - getCorrect(negLabel);
  Triple<Double, Integer, Integer> accuracyInfo = getAccuracyInfo();
  int totalCount = accuracyInfo.second() + accuracyInfo.third();
  sb.append("processed " + totalCount + " tokens with " + getRelevant() + " phrases; ");
  sb.append("found: " + getRetrieved() + " phrases; correct: " + correctPhrases + "\n");
  Formatter formatter = new Formatter(sb, Locale.US);
  formatter.format("accuracy: %6.2f%%; ", accuracyInfo.first() * 100);
  formatter.format("precision: %6.2f%%; ", getPrecision() * 100);
  formatter.format("recall: %6.2f%%; ", getRecall() * 100);
  formatter.format("FB1: %6.2f\n", getFMeasure() * 100);
  for (L label : orderedLabels) {
    if (ignoreNegLabel && label.equals(negLabel)) {
      continue;
    }
    formatter.format("%17s: ", label);
    formatter.format("precision: %6.2f%%; ", getPrecision(label) * 100);
    formatter.format("recall: %6.2f%%; ", getRecall(label) * 100);
    formatter.format("FB1: %6.2f %d\n", getFMeasure(label) * 100, getRetrieved(label));
  }
  return sb.toString();
}
public static void main(String[] args) throws Exception {
  // String serializedClassifier = "classifiers/english.all.3class.distsim.crf.ser.gz";
  String serializedClassifier = "classifiers/english.muc.7class.distsim.crf.ser.gz";
  if (args.length > 0) {
    serializedClassifier = args[0];
  }
  AbstractSequenceClassifier<CoreLabel> classifier = CRFClassifier.getClassifier(serializedClassifier);

  /* For either a file to annotate or for the hardcoded text example, this demo file
     shows several ways to process the input, for teaching purposes. */

  if (args.length > 1) {

    /* For the file, it shows (1) how to run NER on a String, (2) how to get the entities
       in the String with character offsets, and (3) how to run NER on a whole file
       (without loading it into a String). */

    String fileContents = IOUtils.slurpFile(args[1]);
    List<List<CoreLabel>> out = classifier.classify(fileContents);
    for (List<CoreLabel> sentence : out) {
      for (CoreLabel word : sentence) {
        System.out.print(
            word.word() + '/' + word.get(CoreAnnotations.AnswerAnnotation.class) + ' ');
      }
      System.out.println();
    }

    System.out.println("---");
    out = classifier.classifyFile(args[1]);
    for (List<CoreLabel> sentence : out) {
      for (CoreLabel word : sentence) {
        System.out.print(
            word.word() + '/' + word.get(CoreAnnotations.AnswerAnnotation.class) + ' ');
      }
      System.out.println();
    }

    System.out.println("---");
    List<Triple<String, Integer, Integer>> list = classifier.classifyToCharacterOffsets(fileContents);
    for (Triple<String, Integer, Integer> item : list) {
      // print each entity label and the text span it covers
      System.out.println(
          item.first() + ": " + fileContents.substring(item.second(), item.third()));
    }
    System.out.println("---");

    System.out.println("Ten best entity labelings");
    DocumentReaderAndWriter<CoreLabel> readerAndWriter = classifier.makePlainTextReaderAndWriter();
    classifier.classifyAndWriteAnswersKBest(args[1], 10, readerAndWriter);

    System.out.println("---");
    System.out.println("Per-token marginalized probabilities");
    classifier.printProbs(args[1], readerAndWriter);

    // -- This code prints out the first order (token pair) clique probabilities.
    // -- But that output is a bit overwhelming, so we leave it commented out by default.
    // System.out.println("---");
    // System.out.println("First Order Clique Probabilities");
    // ((CRFClassifier) classifier).printFirstOrderProbs(args[1], readerAndWriter);

  } else {

    /* For the hard-coded String, it shows how to run it on a single sentence, and how to
       do this and produce several formats, including slash tags and an inline XML output
       format. It also shows the full contents of the {@code CoreLabel}s that are constructed
       by the classifier. And it shows getting out the probabilities of different assignments
       and an n-best list of classifications with probabilities. */

    String[] example = {
      "Good afternoon Rajat Raina, how are you today? I go to Washington DC on September 19. And Tomorrow.",
      "I go to school at Stanford University, which is located in California."
    };

    for (String str : example) {
      System.out.println(classifier.classifyToString(str));
    }
    System.out.println("---");

    // *** sentence-by-sentence
    for (String str : example) {
      // This one puts in spaces and newlines between tokens, so just print not println.
      System.out.print(classifier.classifyToString(str, "slashTags", false));
    }
    System.out.println("---");

    // *** print: entities + classes + remaining text in the document
    for (String str : example) {
      // This one is best for dealing with the output as a TSV (tab-separated column) file.
      // The first column gives entities, the second their classes, and the third the
      // remaining text in a document.
      System.out.print(classifier.classifyToString(str, "tabbedEntities", false));
    }
    System.out.println("---");

    for (String str : example) {
      System.out.println(classifier.classifyWithInlineXML(str));
    }
    System.out.println("---");

    for (String str : example) {
      System.out.println(classifier.classifyToString(str, "xml", true));
    }
    System.out.println("---");

    for (String str : example) {
      System.out.print(classifier.classifyToString(str, "tsv", false));
    }
    System.out.println("---");

    // This gets out entities with character offsets
    System.out.print("character offsets");
    int j = 0;
    for (String str : example) {
      j++;
      List<Triple<String, Integer, Integer>> triples = classifier.classifyToCharacterOffsets(str);
      for (Triple<String, Integer, Integer> trip : triples) {
        System.out.printf(
            "%s over character offsets [%d, %d) in sentence %d.%n",
            trip.first(), trip.second(), trip.third(), j);
      }
    }
    System.out.println("---");

    // This prints out all the details of what is stored for each token
    int i = 0;
    for (String str : example) {
      for (List<CoreLabel> lcl : classifier.classify(str)) {
        for (CoreLabel cl : lcl) {
          System.out.print(i++ + ": ");
          System.out.println(cl.toShorterString());
        }
      }
    }
    System.out.println("---");
  }
}
public void train(Collection<Pair<Document, List<Entity>>> trainingData) {
  startTrack("Training");
  // --Variables
  RVFDataset<Boolean, Feature> dataset = new RVFDataset<Boolean, Feature>();
  LinearClassifierFactory<Boolean, Feature> fact = new LinearClassifierFactory<Boolean, Feature>();
  // --Feature Extraction
  startTrack("Feature Extraction");
  for (Pair<Document, List<Entity>> datum : trainingData) {
    // (document variables)
    Document doc = datum.getFirst();
    List<Entity> goldClusters = datum.getSecond();
    List<Mention> mentions = doc.getMentions();
    Map<Mention, Entity> goldEntities = Entity.mentionToEntityMap(goldClusters);
    startTrack("Document " + doc.id);
    // (for each mention...)
    for (int i = 0; i < mentions.size(); i++) {
      // (get the mention and its cluster)
      Mention onPrix = mentions.get(i);
      Entity source = goldEntities.get(onPrix);
      if (source == null) {
        throw new IllegalArgumentException("Mention has no gold entity: " + onPrix);
      }
      // (for each previous mention...)
      int oldSize = dataset.size();
      for (int j = i - 1; j >= 0; j--) {
        // (get previous mention and its cluster)
        Mention cand = mentions.get(j);
        Entity target = goldEntities.get(cand);
        if (target == null) {
          throw new IllegalArgumentException("Mention has no gold entity: " + cand);
        }
        // (extract features)
        Counter<Feature> feats =
            extractor.extractFeatures(Pair.make(onPrix, cand.markCoreferent(target)));
        // (add datum)
        dataset.add(new RVFDatum<Boolean, Feature>(feats, target == source));
        // (stop once the gold antecedent has been reached)
        if (target == source) {
          break;
        }
      }
      // logf("Mention %s (%d datums)", onPrix.toString(), dataset.size() - oldSize);
    }
    endTrack("Document " + doc.id);
  }
  endTrack("Feature Extraction");
  // --Train Classifier
  startTrack("Minimizer");
  this.classifier = fact.trainClassifier(dataset);
  endTrack("Minimizer");
  // --Dump Weights
  startTrack("Features");
  // (get labels to print)
  Set<Boolean> labels = new HashSet<Boolean>();
  labels.add(true);
  // (print features)
  for (Triple<Feature, Boolean, Double> featureInfo :
      this.classifier.getTopFeatures(labels, 0.0, true, 100, true)) {
    Feature feature = featureInfo.first();
    Boolean label = featureInfo.second();
    Double magnitude = featureInfo.third();
    // log(FORCE, new DecimalFormat("0.000").format(magnitude) + " [" + label + "] " + feature);
  }
  endTrack("Features");
  endTrack("Training");
}
/**
 * The core implementation of the search.
 *
 * @param root The root word to search from. Traditionally, this is the root of the sentence.
 * @param candidateFragments The callback for the resulting sentence fragments. This is a
 *     predicate of a triple of values. The return value of the predicate determines whether we
 *     should continue searching. The triple is a triple of
 *     <ol>
 *       <li>The log probability of the sentence fragment, according to the featurizer and the
 *           weights
 *       <li>The features along the path to this fragment. The last element of this is the
 *           features from the most recent step.
 *       <li>The sentence fragment. Because it is relatively expensive to compute the resulting
 *           tree, this is returned as a lazy {@link Supplier}.
 *     </ol>
 *     A small illustrative callback of this shape is sketched after this method.
 * @param classifier The classifier for whether an arc should be on the path to a clause split, a
 *     clause split itself, or neither.
 * @param featurizer The featurizer to use. Make sure this matches the weights!
 * @param actionSpace The action space we are allowed to take. Each action defines a means of
 *     splitting a clause on a dependency boundary.
 */
protected void search(
    // The root to search from
    IndexedWord root,
    // The output specs
    final Predicate<Triple<Double, List<Counter<String>>, Supplier<SentenceFragment>>> candidateFragments,
    // The learning specs
    final Classifier<ClauseSplitter.ClauseClassifierLabel, String> classifier,
    Map<String, ? extends List<String>> hardCodedSplits,
    final Function<Triple<State, Action, State>, Counter<String>> featurizer,
    final Collection<Action> actionSpace,
    final int maxTicks) {
  // (the fringe)
  PriorityQueue<Pair<State, List<Counter<String>>>> fringe = new FixedPrioritiesPriorityQueue<>();
  // (avoid duplicate work)
  Set<IndexedWord> seenWords = new HashSet<>();

  State firstState =
      new State(null, null, -9000, null, x -> {}, true); // First state is implicitly "done"
  fringe.add(Pair.makePair(firstState, new ArrayList<>(0)), -0.0);
  int ticks = 0;

  while (!fringe.isEmpty()) {
    if (++ticks > maxTicks) {
      // System.err.println("WARNING! Timed out on search with " + ticks + " ticks");
      return;
    }
    // Useful variables
    double logProbSoFar = fringe.getPriority();
    assert logProbSoFar <= 0.0;
    Pair<State, List<Counter<String>>> lastStatePair = fringe.removeFirst();
    State lastState = lastStatePair.first;
    List<Counter<String>> featuresSoFar = lastStatePair.second;
    IndexedWord rootWord = lastState.edge == null ? root : lastState.edge.getDependent();

    // Register thunk
    if (lastState.isDone) {
      if (!candidateFragments.test(
          Triple.makeTriple(
              logProbSoFar,
              featuresSoFar,
              () -> {
                SemanticGraph copy = new SemanticGraph(tree);
                lastState
                    .thunk
                    .andThen(
                        x -> {
                          // Add the extra edges back in, if they don't break the tree-ness of the
                          // extraction
                          for (IndexedWord newTreeRoot : x.getRoots()) {
                            if (newTreeRoot != null) { // what a strange thing to have happen...
                              for (SemanticGraphEdge extraEdge :
                                  extraEdgesByGovernor.get(newTreeRoot)) {
                                assert Util.isTree(x);
                                //noinspection unchecked
                                addSubtree(
                                    x,
                                    newTreeRoot,
                                    extraEdge.getRelation().toString(),
                                    tree,
                                    extraEdge.getDependent(),
                                    tree.getIncomingEdgesSorted(newTreeRoot));
                                assert Util.isTree(x);
                              }
                            }
                          }
                        })
                    .accept(copy);
                return new SentenceFragment(copy, assumedTruth, false);
              }))) {
        break;
      }
    }

    // Find relevant auxiliary terms
    SemanticGraphEdge subjOrNull = null;
    SemanticGraphEdge objOrNull = null;
    for (SemanticGraphEdge auxEdge : tree.outgoingEdgeIterable(rootWord)) {
      String relString = auxEdge.getRelation().toString();
      if (relString.contains("obj")) {
        objOrNull = auxEdge;
      } else if (relString.contains("subj")) {
        subjOrNull = auxEdge;
      }
    }

    // Iterate over children
    // For each outgoing edge...
    for (SemanticGraphEdge outgoingEdge : tree.outgoingEdgeIterable(rootWord)) {
      // Prohibit indirect speech verbs from splitting off clauses
      // (e.g., 'said', 'think')
      // This fires if the governor is an indirect speech verb, and the outgoing edge is a ccomp
      if (outgoingEdge.getRelation().toString().equals("ccomp")
          && ((outgoingEdge.getGovernor().lemma() != null
                  && INDIRECT_SPEECH_LEMMAS.contains(outgoingEdge.getGovernor().lemma()))
              || INDIRECT_SPEECH_LEMMAS.contains(outgoingEdge.getGovernor().word()))) {
        continue;
      }
      // Get some variables
      String outgoingEdgeRelation = outgoingEdge.getRelation().toString();
      List<String> forcedArcOrder = hardCodedSplits.get(outgoingEdgeRelation);
      if (forcedArcOrder == null && outgoingEdgeRelation.contains(":")) {
        forcedArcOrder =
            hardCodedSplits.get(
                outgoingEdgeRelation.substring(0, outgoingEdgeRelation.indexOf(":")) + ":*");
      }
      boolean doneForcedArc = false;
      // For each action...
      for (Action action :
          (forcedArcOrder == null ? actionSpace : orderActions(actionSpace, forcedArcOrder))) {
        // Check the prerequisite
        if (!action.prerequisitesMet(tree, outgoingEdge)) {
          continue;
        }
        if (forcedArcOrder != null && doneForcedArc) {
          break;
        }
        // 1. Compute the child state
        Optional<State> candidate =
            action.applyTo(tree, lastState, outgoingEdge, subjOrNull, objOrNull);
        if (candidate.isPresent()) {
          double logProbability;
          ClauseClassifierLabel bestLabel;
          Counter<String> features =
              featurizer.apply(Triple.makeTriple(lastState, action, candidate.get()));
          if (forcedArcOrder != null && !doneForcedArc) {
            logProbability = 0.0;
            bestLabel = ClauseClassifierLabel.CLAUSE_SPLIT;
            doneForcedArc = true;
          } else if (features.containsKey("__undocumented_junit_no_classifier")) {
            logProbability = Double.NEGATIVE_INFINITY;
            bestLabel = ClauseClassifierLabel.CLAUSE_INTERM;
          } else {
            Counter<ClauseClassifierLabel> scores = classifier.scoresOf(new RVFDatum<>(features));
            if (scores.size() > 0) {
              Counters.logNormalizeInPlace(scores);
            }
            String rel = outgoingEdge.getRelation().toString();
            if ("nsubj".equals(rel) || "dobj".equals(rel)) {
              scores.remove(
                  ClauseClassifierLabel.NOT_A_CLAUSE); // Always at least yield on nsubj and dobj
            }
            logProbability = Counters.max(scores, Double.NEGATIVE_INFINITY);
            bestLabel = Counters.argmax(scores, (x, y) -> 0, ClauseClassifierLabel.CLAUSE_SPLIT);
          }
          if (bestLabel != ClauseClassifierLabel.NOT_A_CLAUSE) {
            Pair<State, List<Counter<String>>> childState =
                Pair.makePair(
                    candidate.get().withIsDone(bestLabel),
                    new ArrayList<Counter<String>>(featuresSoFar) {
                      {
                        add(features);
                      }
                    });
            // 2. Register the child state
            if (!seenWords.contains(childState.first.edge.getDependent())) {
              // System.err.println("  pushing " + action.signature() + " with " + argmax.first.edge);
              fringe.add(childState, logProbability);
            }
          }
        }
      }
    }

    seenWords.add(rootWord);
  }
  // System.err.println("Search finished in " + ticks + " ticks and " + classifierEvals
  //     + " classifier evaluations.");
}
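A minimal sketch of the candidateFragments callback described in the Javadoc of search(). The fragment limit, the confidence threshold, and the variable names are illustrative assumptions, not part of the original code.

// Hypothetical callback for search(): collect up to ten reasonably confident clause fragments.
// Assumes: java.util.function.Predicate and Supplier, edu.stanford.nlp.stats.Counter,
// edu.stanford.nlp.util.Triple, and edu.stanford.nlp.naturalli.SentenceFragment.
List<SentenceFragment> fragments = new ArrayList<>();
Predicate<Triple<Double, List<Counter<String>>, Supplier<SentenceFragment>>> candidateFragments =
    triple -> {
      double logProb = triple.first();        // log probability of this fragment
      if (logProb > Math.log(0.5)) {          // illustrative confidence threshold
        fragments.add(triple.third().get());  // materialize the lazily built fragment
      }
      return fragments.size() < 10;           // returning false stops the search
    };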