/**
 * Saves the singleton predictor model to the given filename.
 * If there is an error, a RuntimeIOException is thrown.
 *
 * @param predictor the trained singleton classifier to serialize
 * @param filename path of the file to write the serialized model to
 * @throws RuntimeIOException if the model cannot be written
 */
public void saveToSerialized(LogisticClassifier<String, String> predictor, String filename) {
  log.info("Writing singleton predictor in serialized format to file " + filename + ' ');
  // try-with-resources guarantees the stream is closed even if writeObject throws;
  // previously close() was skipped on error, leaking the underlying file handle.
  try (ObjectOutputStream out = IOUtils.writeStreamFromString(filename)) {
    out.writeObject(predictor);
    log.info("done.");
  } catch (IOException ioe) {
    throw new RuntimeIOException(ioe);
  }
}
/** This hashCode uses only the docID, sentenceIndex, and index. See compareTo for more info. */
@Override
public int hashCode() {
  // Hash is memoized; 0 doubles as the "not yet computed" sentinel.
  if (cachedHashCode != 0) {
    return cachedHashCode;
  }
  // 'sensible' tracks whether at least one identifying annotation was present.
  boolean sensible = false;
  int result = 0;
  // NOTE(review): the docID check uses get(...) != null while the next two use
  // containsKey; these differ if a key is mapped to null — presumably intentional
  // for DocID, but worth confirming.
  if (get(CoreAnnotations.DocIDAnnotation.class) != null) {
    result = get(CoreAnnotations.DocIDAnnotation.class).hashCode();
    sensible = true;
  }
  if (containsKey(CoreAnnotations.SentenceIndexAnnotation.class)) {
    result = 29 * result + get(CoreAnnotations.SentenceIndexAnnotation.class).hashCode();
    sensible = true;
  }
  if (containsKey(CoreAnnotations.IndexAnnotation.class)) {
    result = 29 * result + get(CoreAnnotations.IndexAnnotation.class).hashCode();
    sensible = true;
  }
  // Warn loudly: a word with none of the three keys hashes to 0 and will collide.
  if (!sensible) {
    log.info(
        "WARNING!!! You have hashed an IndexedWord with no docID, sentIndex or wordIndex. You will almost certainly lose");
  }
  cachedHashCode = result;
  return result;
}
/**
 * Command-line entry point: trains a singleton predictor from a CoNLL-2011 corpus
 * and writes the serialized model.
 *
 * <p>Required properties: {@code -dcoref.conll2011} (input corpus) and
 * {@code -singleton.predictor.output} (output model file).
 */
public static void main(String[] args) throws Exception {
  // BUG FIX: props was previously left null when args was empty, causing an
  // NPE at props.containsKey below. argsToProperties handles an empty array.
  Properties props = StringUtils.argsToProperties(args);
  if (!props.containsKey("dcoref.conll2011")) {
    log.info("-dcoref.conll2011 [input_CoNLL_corpus]: was not specified");
    return;
  }
  if (!props.containsKey("singleton.predictor.output")) {
    log.info("-singleton.predictor.output [output_model_file]: was not specified");
    return;
  }
  SingletonPredictor predictor = new SingletonPredictor();
  // Pipeline: featurize the corpus, train a logistic classifier, serialize it.
  GeneralDataset<String, String> data = predictor.generateFeatureVectors(props);
  LogisticClassifier<String, String> classifier = predictor.train(data);
  predictor.saveToSerialized(classifier, props.getProperty("singleton.predictor.output"));
}
// should be able to pass in a comparator! protected static double precision(Set<?> s1, Set<?> s2) { double n = 0.0; double p = 0.0; for (Object o1 : s1) { if (s2.contains(o1)) { p += 1.0; } if (DEBUG) { if (s2.contains(o1)) { log.info("Eval Found: " + o1); } else { log.info("Eval Failed to find: " + o1); } } n += 1.0; } if (DEBUG) log.info("Matched " + p + " of " + n); return (n > 0.0 ? p / n : 0.0); }
/**
 * Annotates the given sentence with the base pipeline and then the supplied
 * time annotator, optionally attaching a document date first.
 *
 * @param sentence the raw text to process
 * @param dateString document date to set; ignored if null or empty
 * @param timeAnnotator annotator applied after the base pipeline
 * @return the fully annotated Annotation
 */
public Annotation process(String sentence, String dateString, Annotator timeAnnotator) {
  log.info("Processing text \"" + sentence + "\" with dateString = " + dateString);
  Annotation annotation = new Annotation(sentence);
  boolean hasDate = dateString != null && !dateString.equals("");
  if (hasDate) {
    annotation.set(CoreAnnotations.DocDateAnnotation.class, dateString);
  }
  pipeline.annotate(annotation);
  timeAnnotator.annotate(annotation);
  return annotation;
}
/**
 * Computes classification accuracy over the given examples: the fraction whose
 * predicted label equals the gold label.
 *
 * @param exampleIterator iterator over labeled examples
 * @return number correct divided by total, as a float
 */
public float accuracy(Iterator<RVFDatum<L, F>> exampleIterator) {
  int numCorrect = 0;
  int numSeen = 0;
  while (exampleIterator.hasNext()) {
    RVFDatum<L, F> datum = exampleIterator.next();
    L predicted = classOf(datum);
    if (predicted.equals(datum.label())) {
      numCorrect++;
    }
    numSeen++;
  }
  logger.info("correct " + numCorrect + " out of " + numSeen);
  return numCorrect / (float) numSeen;
}
private static Set<String> readDict(String filename, boolean normalize) { Set<String> word = Generics.newHashSet(); logger.info( "Loading " + (normalize ? "normalized" : "unnormalized") + " dictionary from " + filename); try { InputStream is = IOUtils.getInputStreamFromURLOrClasspathOrFileSystem(filename); BufferedReader wordDetectorReader = new BufferedReader(new InputStreamReader(is, "UTF-8")); int i = 0; for (String wordDetectorLine; (wordDetectorLine = wordDetectorReader.readLine()) != null; ) { i++; // String[] fields = wordDetectorLine.split(" "); // logger.debug("DEBUG: "+filename+" "+wordDetectorLine); int origLeng = wordDetectorLine.length(); wordDetectorLine = wordDetectorLine.trim(); int newLeng = wordDetectorLine.length(); if (newLeng != origLeng) { EncodingPrintWriter.err.println( "Line " + i + " of " + filename + " has leading/trailing whitespace: |" + wordDetectorLine + "|", "UTF-8"); } if (newLeng == 0) { EncodingPrintWriter.err.println("Line " + i + " of " + filename + " is empty", "UTF-8"); } else { if (normalize) { wordDetectorLine = ChineseUtils.normalize( wordDetectorLine, ChineseUtils.ASCII, ChineseUtils.ASCII, ChineseUtils.NORMALIZE); } word.add(wordDetectorLine); } } is.close(); } catch (IOException e) { throw new RuntimeIOException(e); } return word; }
public static void main(String[] args) { if (args.length < minArgs) { System.out.println(usage()); System.exit(-1); } Properties options = StringUtils.argsToProperties(args, argDefs()); Language language = PropertiesUtils.get(options, "l", Language.English, Language.class); TreebankLangParserParams tlpp = language.params; DiskTreebank tb = null; String encoding = options.getProperty("l", "UTF-8"); boolean removeBracket = PropertiesUtils.getBool(options, "b", false); tlpp.setInputEncoding(encoding); tlpp.setOutputEncoding(encoding); tb = tlpp.diskTreebank(); String[] files = options.getProperty("", "").split("\\s+"); if (files.length != 0) { for (String filename : files) { tb.loadPath(filename); } } else { log.info(usage()); System.exit(-1); } PrintWriter pwo = tlpp.pw(); String startSymbol = tlpp.treebankLanguagePack().startSymbol(); TreeFactory tf = new LabeledScoredTreeFactory(); int nTrees = 0; for (Tree t : tb) { if (removeBracket) { if (t.value().equals(startSymbol)) { t = t.firstChild(); } } else if (!t.value().equals(startSymbol)) { // Add a bracket if it isn't already there t = tf.newTreeNode(startSymbol, Collections.singletonList(t)); } pwo.println(t.toString()); nTrees++; } pwo.close(); System.err.printf("Processed %d trees.%n", nTrees); }
/**
 * Trains a one-vs-all classifier: for each label in {@code trainLabels}, trains
 * a binary classifier that separates that label (POS_LABEL) from all others
 * (NEG_LABEL), then bundles the binary classifiers into a OneVsAllClassifier.
 *
 * NOTE(review): posIndex and binaryIndex are not defined in this method —
 * presumably static fields of the enclosing class holding the binary label
 * index and the position of POS_LABEL in it; verify in the full class.
 */
public static <L, F> OneVsAllClassifier<L, F> train(
    ClassifierFactory<String, F, Classifier<String, F>> classifierFactory,
    GeneralDataset<L, F> dataset,
    Collection<L> trainLabels) {
  Index<L> labelIndex = dataset.labelIndex();
  Index<F> featureIndex = dataset.featureIndex();
  Map<L, Classifier<String, F>> classifiers = Generics.newHashMap();
  for (L label : trainLabels) {
    int i = labelIndex.indexOf(label);
    logger.info("Training " + label + " = " + i + ", posIndex = " + posIndex);
    // Create training data for training this classifier:
    // map the current label to POS_LABEL and everything else to NEG_LABEL.
    Map<L, String> posLabelMap = new ArrayMap<>();
    posLabelMap.put(label, POS_LABEL);
    GeneralDataset<String, F> binaryDataset =
        dataset.mapDataset(dataset, binaryIndex, posLabelMap, NEG_LABEL);
    Classifier<String, F> binaryClassifier = classifierFactory.trainClassifier(binaryDataset);
    classifiers.put(label, binaryClassifier);
  }
  OneVsAllClassifier<L, F> classifier =
      new OneVsAllClassifier<>(featureIndex, labelIndex, classifiers);
  return classifier;
}
/**
 * Initializes training state: delegates to the superclass, computes the tree
 * index after which unknown-word counting begins, resets the seen/unseen
 * counters, and builds the underlying English unknown-word model.
 */
@Override
public void initializeTraining(
    Options op, Lexicon lex, Index<String> wordIndex, Index<String> tagIndex, double totalTrees) {
  super.initializeTraining(op, lex, wordIndex, tagIndex, totalTrees);
  // Unknown-word statistics are only collected on the trailing fraction of the
  // training trees, as configured by fractionBeforeUnseenCounting.
  this.indexToStartUnkCounting = (totalTrees * op.trainOptions.fractionBeforeUnseenCounting);
  seenCounter = new ClassicCounter<>();
  unSeenCounter = new ClassicCounter<>();
  // The unseen counter is shared with the wrapped model.
  model = new EnglishUnknownWordModel(op, lex, wordIndex, tagIndex, unSeenCounter);
  // scan data
  if (DOCUMENT_UNKNOWNS) {
    log.info(
        "Collecting "
            + Lexicon.UNKNOWN_WORD
            + " from trees "
            + (indexToStartUnkCounting + 1)
            + " to "
            + totalTrees);
  }
}
/**
 * Scores a guess tree against a gold tree and accumulates weighted
 * precision/recall/F1 statistics into this evaluator's running totals.
 *
 * @param guess the parser's output tree
 * @param gold the reference tree
 * @param pw if non-null, per-sentence and running-average scores are printed here
 * @param weight weight of this sentence in the running (sentence-averaged) totals
 */
public void evaluate(Tree guess, Tree gold, PrintWriter pw, double weight) {
  if (DEBUG) {
    log.info("Evaluating gold tree:");
    gold.pennPrint(System.err);
    log.info("and guess tree");
    guess.pennPrint(System.err);
  }
  // Decompose each tree into comparable objects (e.g. dependencies/constituents).
  Set<?> dep1 = makeObjects(guess);
  Set<?> dep2 = makeObjects(gold);
  // Recall is precision with the argument order swapped.
  final double curPrecision = precision(dep1, dep2);
  final double curRecall = precision(dep2, dep1);
  // Harmonic mean; defined as 0 when either component is 0.
  curF1 =
      (curPrecision > 0.0 && curRecall > 0.0
          ? 2.0 / (1.0 / curPrecision + 1.0 / curRecall)
          : 0.0);
  // Sentence-averaged running totals (weighted).
  precision += curPrecision * weight;
  recall += curRecall * weight;
  f1 += curF1 * weight;
  num += weight;
  // Evalb-style totals: weighted by the number of objects per tree.
  precision2 += dep1.size() * curPrecision * weight;
  pnum2 += dep1.size() * weight;
  recall2 += dep2.size() * curRecall * weight;
  rnum2 += dep2.size() * weight;
  // Tolerance on F1 ~ 1.0 counts as an exact match.
  if (curF1 > 0.9999) {
    exact += 1.0;
  }
  if (pw != null) {
    // Scores are truncated (not rounded) to two decimal places for display.
    pw.print(" P: " + ((int) (curPrecision * 10000)) / 100.0);
    if (runningAverages) {
      pw.println(
          " (sent ave "
              + ((int) (precision * 10000 / num)) / 100.0
              + ") (evalb "
              + ((int) (precision2 * 10000 / pnum2)) / 100.0
              + ")");
    }
    pw.print(" R: " + ((int) (curRecall * 10000)) / 100.0);
    if (runningAverages) {
      pw.print(
          " (sent ave "
              + ((int) (recall * 10000 / num)) / 100.0
              + ") (evalb "
              + ((int) (recall2 * 10000 / rnum2)) / 100.0
              + ")");
    }
    pw.println();
    // Running evalb F1 computed from the size-weighted totals.
    double cF1 = 2.0 / (rnum2 / recall2 + pnum2 / precision2);
    pw.print(str + " F1: " + ((int) (curF1 * 10000)) / 100.0);
    if (runningAverages) {
      pw.print(
          " (sent ave "
              + ((int) (10000 * f1 / num)) / 100.0
              + ", evalb "
              + ((int) (10000 * cF1)) / 100.0
              + ") Exact: "
              + ((int) (10000 * exact / num)) / 100.0);
    }
    // pw.println(" N: " + getNum());
    pw.println(" N: " + num);
  }
  /*
    Sentence s = guess.yield();
    for (Object obj : s) {
      if (curF1 < 0.7) {
        badwords.incrementCount(obj);
      } else {
        goodwords.incrementCount(obj);
      }
    }
  */
}
/**
 * Configure all parameters for converting a list of tokens into sentences. The whole enchilada.
 *
 * @param boundaryTokenRegex Tokens that match this regex will end a sentence, but are retained at
 *     the end of the sentence. Substantive value must be supplied.
 * @param boundaryFollowersRegex This is a Set of String that are matched with .equals() which are
 *     allowed to be tacked onto the end of a sentence after a sentence boundary token, for
 *     example ")". Substantive value must be supplied.
 * @param boundariesToDiscard This is normally used for newline tokens if they are included in the
 *     tokenization. They may end the sentence (depending on the setting of
 *     newlineIsSentenceBreak), but at any rate are deleted from sentences in the output.
 *     Substantive value must be supplied.
 * @param xmlBreakElementsToDiscard These are elements like "p" or "sent", which will be wrapped
 *     into regex for approximate XML matching. They will be deleted in the output, and will
 *     always trigger a sentence boundary. May be null; means discard none.
 * @param regionElementRegex XML element name regex to delimit regions processed. Tokens outside
 *     one of these elements are discarded. May be null; means to not filter by regions
 * @param newlineIsSentenceBreak How to treat newlines. Must have substantive value.
 * @param sentenceBoundaryMultiTokenPattern A TokensRegex multi-token pattern for finding
 *     boundaries. May be null; means that there are no such patterns.
 * @param tokenRegexesToDiscard Regex for tokens to discard. May be null; means that no tokens are
 *     discarded in this way.
 * @param isOneSentence Whether to treat whole of input as one sentence regardless. Must have
 *     substantive value. Overrides anything else.
 * @param allowEmptySentences Whether to allow empty sentences to be output Must have substantive
 *     value. Often suppressed, but don't want that in things like strict one-sentence-per-line
 *     mode.
 */
public WordToSentenceProcessor(
    String boundaryTokenRegex,
    String boundaryFollowersRegex,
    Set<String> boundariesToDiscard,
    Set<String> xmlBreakElementsToDiscard,
    String regionElementRegex,
    NewlineIsSentenceBreak newlineIsSentenceBreak,
    SequencePattern<? super IN> sentenceBoundaryMultiTokenPattern,
    Set<String> tokenRegexesToDiscard,
    boolean isOneSentence,
    boolean allowEmptySentences) {
  // Compile the token-level boundary patterns up front.
  sentenceBoundaryTokenPattern = Pattern.compile(boundaryTokenRegex);
  sentenceBoundaryFollowersPattern = Pattern.compile(boundaryFollowersRegex);
  sentenceBoundaryToDiscard = Collections.unmodifiableSet(boundariesToDiscard);
  // Element names are wrapped into an approximate XML open/close tag regex.
  if (xmlBreakElementsToDiscard == null || xmlBreakElementsToDiscard.isEmpty()) {
    this.xmlBreakElementsToDiscard = null;
  } else {
    this.xmlBreakElementsToDiscard = new ArrayList<>(xmlBreakElementsToDiscard.size());
    for (String s : xmlBreakElementsToDiscard) {
      String regex = "<\\s*(?:/\\s*)?(?:" + s + ")(?:\\s+[^>]+?|\\s*(?:/\\s*)?)>";
      // log.info("Regex is |" + regex + "|");
      // todo: Historically case insensitive, but maybe better and more proper to make case
      // sensitive?
      this.xmlBreakElementsToDiscard.add(Pattern.compile(regex, Pattern.CASE_INSENSITIVE));
    }
  }
  // Region delimiters: open/close tag patterns built from the element name regex.
  if (regionElementRegex != null) {
    sentenceRegionBeginPattern =
        Pattern.compile("<\\s*(?:" + regionElementRegex + ")(?:\\s+[^>]+?)?>");
    sentenceRegionEndPattern = Pattern.compile("<\\s*/\\s*(?:" + regionElementRegex + ")\\s*>");
  } else {
    sentenceRegionBeginPattern = null;
    sentenceRegionEndPattern = null;
  }
  this.newlineIsSentenceBreak = newlineIsSentenceBreak;
  this.sentenceBoundaryMultiTokenPattern = sentenceBoundaryMultiTokenPattern;
  // Token-discard regexes are pre-compiled; null means no discarding.
  if (tokenRegexesToDiscard != null) {
    this.tokenPatternsToDiscard = new ArrayList<>(tokenRegexesToDiscard.size());
    for (String s : tokenRegexesToDiscard) {
      this.tokenPatternsToDiscard.add(Pattern.compile(s));
    }
  } else {
    this.tokenPatternsToDiscard = null;
  }
  this.isOneSentence = isOneSentence;
  this.allowEmptySentences = allowEmptySentences;
  if (DEBUG) {
    log.info("WordToSentenceProcessor: boundaryTokens=" + boundaryTokenRegex);
    log.info("  boundaryFollowers=" + boundaryFollowersRegex);
    log.info("  boundariesToDiscard=" + boundariesToDiscard);
    log.info("  xmlBreakElementsToDiscard=" + xmlBreakElementsToDiscard);
    log.info("  regionBeginPattern=" + sentenceRegionBeginPattern);
    log.info("  regionEndPattern=" + sentenceRegionEndPattern);
    log.info("  newlineIsSentenceBreak=" + newlineIsSentenceBreak);
    log.info("  sentenceBoundaryMultiTokenPattern=" + sentenceBoundaryMultiTokenPattern);
    log.info("  tokenPatternsToDiscard=" + tokenPatternsToDiscard);
    log.info("  isOneSentence=" + isOneSentence);
    log.info("  allowEmptySentences=" + allowEmptySentences);
  }
}
/**
 * Returns a List of Lists where each element is built from a run of Words in the input Document.
 * Specifically, reads through each word in the input document and breaks off a sentence after
 * finding a valid sentence boundary token or end of file. Note that for this to work, the words
 * in the input document must have been tokenized with a tokenizer that makes sentence boundary
 * tokens their own tokens (e.g., {@link PTBTokenizer}).
 *
 * @param words A list of already tokenized words (must implement HasWord or be a String).
 * @return A list of sentences.
 * @see #WordToSentenceProcessor(String, String, Set, Set, String, NewlineIsSentenceBreak,
 *     SequencePattern, Set, boolean, boolean)
 */
public List<List<IN>> wordsToSentences(List<? extends IN> words) {
  // Identity map so that a specific token object (not an equal one elsewhere in
  // the list) can be marked as a boundary by the multi-token pattern pass.
  IdentityHashMap<Object, Boolean> isSentenceBoundary =
      null; // is null unless used by sentenceBoundaryMultiTokenPattern
  if (sentenceBoundaryMultiTokenPattern != null) {
    // Do initial pass using tokensregex to identify multi token patterns that need to be matched
    // and add the last token to our table of sentence boundary tokens
    isSentenceBoundary = new IdentityHashMap<>();
    SequenceMatcher<? super IN> matcher = sentenceBoundaryMultiTokenPattern.getMatcher(words);
    while (matcher.find()) {
      List nodes = matcher.groupNodes();
      if (nodes != null && !nodes.isEmpty()) {
        // Only the last token of a multi-token match ends the sentence.
        isSentenceBoundary.put(nodes.get(nodes.size() - 1), true);
      }
    }
  }

  // Split tokens into sentences!!!
  List<List<IN>> sentences = Generics.newArrayList();
  List<IN> currentSentence = new ArrayList<>();
  List<IN> lastSentence = null;          // completed sentence that "followers" may still attach to
  boolean insideRegion = false;          // are we inside a regionElementRegex region?
  boolean inWaitForForcedEnd = false;    // suppress breaks until a forced-end token arrives
  boolean lastTokenWasNewline = false;   // for TWO_CONSECUTIVE newline handling
  for (IN o : words) {
    String word = getString(o);
    boolean forcedEnd = isForcedEndToken(o);
    boolean inMultiTokenExpr = false;
    boolean discardToken = false;
    if (o instanceof CoreMap) {
      // Hacky stuff to ensure sentence breaks do not happen in certain cases
      CoreMap cm = (CoreMap) o;
      Boolean forcedUntilEndValue =
          cm.get(CoreAnnotations.ForcedSentenceUntilEndAnnotation.class);
      if (!forcedEnd) {
        if (forcedUntilEndValue != null && forcedUntilEndValue) inWaitForForcedEnd = true;
        else {
          MultiTokenTag mt = cm.get(CoreAnnotations.MentionTokenAnnotation.class);
          if (mt != null && !mt.isEnd()) {
            // In the middle of a multi token mention, make sure sentence is not ended here
            inMultiTokenExpr = true;
          }
        }
      }
    }
    if (tokenPatternsToDiscard != null) {
      discardToken = matchesTokenPatternsToDiscard(word);
    }
    // Region filtering: tokens outside a region are dropped entirely; a
    // region-begin tag flips us inside.
    if (sentenceRegionBeginPattern != null && !insideRegion) {
      if (DEBUG) {
        log.info("Word is " + word + "; outside region; deleted");
      }
      if (sentenceRegionBeginPattern.matcher(word).matches()) {
        insideRegion = true;
        if (DEBUG) {
          log.info("  entering region");
        }
      }
      lastTokenWasNewline = false;
      continue;
    }
    // A "follower" (e.g. a closing paren) right after a boundary is appended
    // to the just-completed sentence rather than starting a new one.
    if (lastSentence != null
        && currentSentence.isEmpty()
        && sentenceBoundaryFollowersPattern.matcher(word).matches()) {
      if (!discardToken) {
        lastSentence.add(o);
      }
      if (DEBUG) {
        log.info("Word is " + word + (discardToken ? "discarded" : " added to last sentence"));
      }
      lastTokenWasNewline = false;
      continue;
    }

    boolean newSent = false;
    String debugText = (discardToken) ? "discarded" : "added to current";
    if (inWaitForForcedEnd && !forcedEnd) {
      // Breaks suppressed until the forced end arrives.
      if (!discardToken) currentSentence.add(o);
      if (DEBUG) {
        log.info("Word is " + word + "; is in wait for forced end; " + debugText);
      }
    } else if (inMultiTokenExpr && !forcedEnd) {
      // Don't split in the middle of a multi-token mention.
      if (!discardToken) currentSentence.add(o);
      if (DEBUG) {
        log.info("Word is " + word + "; is in multi token expr; " + debugText);
      }
    } else if (sentenceBoundaryToDiscard.contains(word)) {
      // Discarded boundary token (typically a newline): may or may not end the
      // sentence depending on the newline policy; never kept in output.
      if (newlineIsSentenceBreak == NewlineIsSentenceBreak.ALWAYS) {
        newSent = true;
      } else if (newlineIsSentenceBreak == NewlineIsSentenceBreak.TWO_CONSECUTIVE) {
        if (lastTokenWasNewline) {
          newSent = true;
        }
      }
      lastTokenWasNewline = true;
      if (DEBUG) {
        log.info("Word is " + word + " discarded sentence boundary");
      }
    } else {
      lastTokenWasNewline = false;
      Boolean isb;
      if (xmlBreakElementsToDiscard != null && matchesXmlBreakElementToDiscard(word)) {
        // XML break elements always end the sentence and are dropped.
        newSent = true;
        if (DEBUG) {
          log.info("Word is " + word + "; is XML break element; discarded");
        }
      } else if (sentenceRegionEndPattern != null
          && sentenceRegionEndPattern.matcher(word).matches()) {
        insideRegion = false;
        newSent = true;
        // Marked sentence boundaries
      } else if ((isSentenceBoundary != null)
          && ((isb = isSentenceBoundary.get(o)) != null)
          && isb) {
        if (!discardToken) currentSentence.add(o);
        if (DEBUG) {
          log.info(
              "Word is "
                  + word
                  + "; is sentence boundary (matched multi-token pattern); "
                  + debugText);
        }
        newSent = true;
      } else if (sentenceBoundaryTokenPattern.matcher(word).matches()) {
        // Regular boundary token (e.g. "."): kept at the end of the sentence.
        if (!discardToken) currentSentence.add(o);
        if (DEBUG) {
          log.info("Word is " + word + "; is sentence boundary; " + debugText);
        }
        newSent = true;
      } else if (forcedEnd) {
        if (!discardToken) currentSentence.add(o);
        inWaitForForcedEnd = false;
        newSent = true;
        if (DEBUG) {
          log.info("Word is " + word + "; annotated to be the end of a sentence; " + debugText);
        }
      } else {
        // Ordinary word: just accumulate.
        if (!discardToken) currentSentence.add(o);
        if (DEBUG) {
          log.info("Word is " + word + "; " + debugText);
        }
      }
    }
    if (newSent && (!currentSentence.isEmpty() || allowEmptySentences)) {
      if (DEBUG) {
        log.info("  beginning new sentence");
      }
      sentences.add(currentSentence); // adds this sentence now that it's complete
      lastSentence = currentSentence;
      currentSentence = new ArrayList<>(); // clears the current sentence
    }
  }

  // add any words at the end, even if there isn't a sentence
  // terminator at the end of file
  if (!currentSentence.isEmpty()) {
    sentences.add(currentSentence); // adds last sentence
  }

  return sentences;
}