/**
 * Create the NFAs and DFAs for a grammar and generate its code. NFAs for any delegates are
 * created first. Once all NFAs are created, it is safe to create the DFAs, which must check for
 * left recursion; that check walks the full NFA, so the NFA must be complete. After all NFAs are
 * built, DFA conversion and then code generation run for the root grammar; DFA conversion and
 * code generation for the delegates come next.
 */
protected void generateRecognizer(Grammar grammar) {
  String language = (String) grammar.getOption("language");
  if (language != null) {
    CodeGenerator generator = new CodeGenerator(this, grammar, language);
    grammar.setCodeGenerator(generator);
    generator.setDebug(isDebug());
    generator.setProfile(isProfile());
    generator.setTrace(isTrace());

    // generate NFA early in case of crash later (for debugging)
    if (isGenerate_NFA_dot()) {
      generateNFAs(grammar);
    }

    // GENERATE CODE
    generator.genRecognizer();

    if (isGenerate_DFA_dot()) {
      generateDFAs(grammar);
    }

    List<Grammar> delegates = grammar.getDirectDelegates();
    for (int i = 0; delegates != null && i < delegates.size(); i++) {
      Grammar delegate = (Grammar) delegates.get(i);
      if (delegate != grammar) { // already processing this one
        generateRecognizer(delegate);
      }
    }
  }
}
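For orientation, a minimal sketch of the call order this method assumes; it mirrors the sequence used in process() further down in this file, where the composite gets token types, grammar symbols, and NFAs before code generation (all identifiers below already appear elsewhere in this file):

// sketch only: prepare the composite, then generate the recognizer
Grammar grammar = getRootGrammar(grammarFileName); // root grammar plus any delegates
grammar.composite.assignTokenTypes();              // token types for the whole composite
grammar.composite.defineGrammarSymbols();          // rules, scopes, labels
grammar.composite.createNFAs();                    // full NFA, needed for the left-recursion check
generateRecognizer(grammar);                       // DFA conversion + code gen, root then delegates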
ATN createATN(Grammar g) {
  if (g.atn != null) return g.atn;
  semanticProcess(g);
  ParserATNFactory f = new ParserATNFactory(g);
  if (g.isLexer()) f = new LexerATNFactory((LexerGrammar) g);
  g.atn = f.createATN();
  return g.atn;
}
protected void semanticProcess(Grammar g) {
  if (g.ast != null && !g.ast.hasErrors) {
    System.out.println(g.ast.toStringTree());
    Tool antlr = new Tool();
    SemanticPipeline sem = new SemanticPipeline(g);
    sem.process();
    if (g.getImportedGrammars() != null) {
      // process imported grammars (if any)
      for (Grammar imp : g.getImportedGrammars()) {
        antlr.processNonCombinedGrammar(imp, false);
      }
    }
  }
}
@Override
public String toString() {
  return "GrammarCorpus, based on " + grammar.getClass() + "-grammar, " + sentences + " sentences";
}
/**
 * This method is used by all code generators to create new output files. If the output directory
 * set by -o does not exist, it is created. The final filename is sensitive to the output
 * directory and the directory where the grammar file was found: if -o is /tmp and the original
 * grammar file was foo/t.g, then output files go in /tmp/foo.
 *
 * <p>The output dir -o spec takes precedence if it's absolute. E.g., if the grammar file dir is
 * absolute, the output dir is given precedence: "-o /tmp /usr/lib/t.g" results in "/tmp/T.java"
 * as output (assuming t.g holds T.java).
 *
 * <p>If no -o is specified, just write to the directory where the grammar file was found.
 *
 * <p>If outputDirectory==null, write to a String (a StringWriter is returned).
 */
public Writer getOutputFile(Grammar g, String fileName) throws IOException {
  if (getOutputDirectory() == null) {
    return new StringWriter();
  }
  // The output directory is a function of where the grammar file lives. For subdir/T.g you get
  // subdir here (well, depends on -o etc.). But if this is a .tokens file, then we force the
  // output to the base output directory (or the current directory if there is no -o).
  File outputDir;
  if (fileName.endsWith(CodeGenerator.VOCAB_FILE_EXTENSION)) {
    if (haveOutputDir) {
      outputDir = new File(getOutputDirectory());
    } else {
      outputDir = new File(".");
    }
  } else {
    outputDir = getOutputDirectory(g.getFileName());
  }
  File outputFile = new File(outputDir, fileName);
  if (!outputDir.exists()) {
    outputDir.mkdirs();
  }
  FileWriter fw = new FileWriter(outputFile);
  return new BufferedWriter(fw);
}
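A quick illustration of the resolution rules described in the Javadoc above (hypothetical paths; assumes t.g generates T.java plus a T.tokens vocab file):

// -o /tmp with grammar foo/t.g       -> /tmp/foo/T.java
// -o /tmp with grammar /usr/lib/t.g  -> /tmp/T.java      (absolute grammar dir: -o wins outright)
// no -o with grammar foo/t.g         -> foo/T.java       (next to the grammar file)
// any .tokens vocab file             -> base output dir  (-o dir, or "." if -o was not given)
// getOutputDirectory() == null       -> a StringWriter, no file is written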
protected void generateNFAs(Grammar g) {
  DOTGenerator dotGenerator = new DOTGenerator(g);
  Collection rules = g.getAllImportedRules();
  rules.addAll(g.getRules());
  for (Iterator itr = rules.iterator(); itr.hasNext(); ) {
    Rule r = (Rule) itr.next();
    try {
      String dot = dotGenerator.getDOT(r.startState);
      if (dot != null) {
        writeDOTFile(g, r, dot);
      }
    } catch (IOException ioe) {
      ErrorManager.error(ErrorManager.MSG_CANNOT_WRITE_FILE, ioe);
    }
  }
}
public int expand(Unit unit, Grammar grammar, Rule parent, List replacement) {
  Symbol whiteSymbol = grammar.declared(this);
  if (whiteSymbol == null) return -1;
  replacement.add(whiteSymbol);
  return replacement.size() - 1;
}
List<ANTLRMessage> checkRuleDFA(String gtext, String ruleName, String expecting) throws Exception {
  ErrorQueue equeue = new ErrorQueue();
  Grammar g = new Grammar(gtext, equeue);
  ATN atn = createATN(g);
  Rule r = g.getRule(ruleName);
  if (r == null) { // check the rule before indexing so an unknown name reports cleanly
    System.err.println("no such rule: " + ruleName);
    return null;
  }
  ATNState s = atn.ruleToStartState[r.index];
  ATNState t = s.transition(0).target;
  if (!(t instanceof DecisionState)) {
    System.out.println(ruleName + " has no decision");
    return null;
  }
  DecisionState blk = (DecisionState) t;
  checkRuleDFA(g, blk, expecting);
  return equeue.all;
}
List<Integer> getTypesFromString(Grammar g, String expecting) {
  List<Integer> expectingTokenTypes = new ArrayList<Integer>();
  if (expecting != null && !expecting.trim().equals("")) {
    for (String tname : expecting.replace(" ", "").split(",")) {
      int ttype = g.getTokenType(tname);
      expectingTokenTypes.add(ttype);
    }
  }
  return expectingTokenTypes;
}
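For example, the whitespace stripping and comma split above turn a spec such as "ID, INT , SEMI" into one token-type lookup per name (hypothetical token names, shown only to illustrate the normalization):

String[] names = "ID, INT , SEMI".replace(" ", "").split(","); // {"ID", "INT", "SEMI"}
// each name is then mapped to its type with g.getTokenType(name)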
public void generateDFAs(Grammar g) {
  for (int d = 1; d <= g.getNumberOfDecisions(); d++) {
    DFA dfa = g.getLookaheadDFA(d);
    if (dfa == null) {
      continue; // not there for some reason, ignore
    }
    DOTGenerator dotGenerator = new DOTGenerator(g);
    String dot = dotGenerator.getDOT(dfa.startState);
    String dotFileName = g.name + "." + "dec-" + d;
    if (g.implicitLexer) {
      dotFileName = g.name + Grammar.grammarTypeToFileNameSuffix[g.type] + "." + "dec-" + d;
    }
    try {
      writeDOTFile(g, dotFileName, dot);
    } catch (IOException ioe) {
      ErrorManager.error(ErrorManager.MSG_CANNOT_GEN_DOT_FILE, dotFileName, ioe);
    }
  }
}
/** Get a grammar mentioned on the command-line and any delegates. */
public Grammar getRootGrammar(String grammarFileName) throws IOException {
  // StringTemplate.setLintMode(true);
  // Grammars mentioned on the command line are either roots or single grammars.
  // Create the necessary composite in case it's got delegates; even a
  // single grammar needs it to get token types.
  CompositeGrammar composite = new CompositeGrammar();
  Grammar grammar = new Grammar(this, grammarFileName, composite);
  composite.setDelegationRoot(grammar);
  FileReader fr = null;
  File f = null;
  if (haveInputDir) {
    f = new File(inputDirectory, grammarFileName);
  } else {
    f = new File(grammarFileName);
  }

  // Store the location of this grammar; if we import files, we can then search for imports in
  // the same location as the original grammar as well as in the lib directory.
  parentGrammarDirectory = f.getParent();

  if (grammarFileName.lastIndexOf(File.separatorChar) == -1) {
    grammarOutputDirectory = ".";
  } else {
    grammarOutputDirectory =
        grammarFileName.substring(0, grammarFileName.lastIndexOf(File.separatorChar));
  }

  fr = new FileReader(f);
  BufferedReader br = new BufferedReader(fr);
  grammar.parseAndBuildAST(br);
  composite.watchNFAConversion = internalOption_watchNFAConversion;
  br.close();
  fr.close();
  return grammar;
}
/**
 * Runs one E step: computes inside/outside scores for every training tree with the previous
 * grammar/lexicon, tallies the expected counts into the new {@code grammar} and {@code lexicon},
 * and returns the accumulated training log-likelihood.
 */
public static double doOneEStep(
    Grammar previousGrammar,
    Lexicon previousLexicon,
    Grammar grammar,
    Lexicon lexicon,
    StateSetTreeList trainStateSetTrees,
    boolean updateOnlyLexicon,
    int unkThreshold) {
  boolean secondHalf = false;
  ArrayParser parser = new ArrayParser(previousGrammar, previousLexicon);
  double trainingLikelihood = 0;
  int n = 0;
  int nTrees = trainStateSetTrees.size();
  for (Tree<StateSet> stateSetTree : trainStateSetTrees) {
    secondHalf = (n++ > nTrees / 2.0);
    boolean noSmoothing = true, debugOutput = false;
    parser.doInsideOutsideScores(stateSetTree, noSmoothing, debugOutput); // E Step
    double ll = stateSetTree.getLabel().getIScore(0);
    ll = Math.log(ll) + (100 * stateSetTree.getLabel().getIScale());
    // System.out.println(stateSetTree);
    if ((Double.isInfinite(ll) || Double.isNaN(ll))) {
      if (VERBOSE) {
        System.out.println("Training sentence " + n + " is given " + ll + " log likelihood!");
        System.out.println(
            "Root iScore "
                + stateSetTree.getLabel().getIScore(0)
                + " scale "
                + stateSetTree.getLabel().getIScale());
      }
    } else {
      lexicon.trainTree(stateSetTree, -1, previousLexicon, secondHalf, noSmoothing, unkThreshold);
      if (!updateOnlyLexicon) grammar.tallyStateSetTree(stateSetTree, previousGrammar); // E Step
      trainingLikelihood += ll; // for some reason, some sentences are unparsable
    }
  }
  lexicon.tieRareWordStats(unkThreshold);
  // SSIE ((SophisticatedLexicon) lexicon).overwriteWithMaxent();
  return trainingLikelihood;
}
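The log-likelihood line above combines the root inside score with its scale. A minimal self-contained sketch of that convention, assuming a probability is stored as the pair (iScore, iScale) with value iScore * exp(100 * iScale); the factor 100 matches the constant used in the method, and the numbers below are made up purely for illustration:

// Hypothetical illustration (not part of the trainer): recovering log(p) from a scaled score.
static double scaledLogLikelihoodDemo() {
  double p = 1e-120;                          // a probability awkward to keep unscaled
  int iScale = -3;                            // scale chosen so iScore stays well inside double range
  double iScore = p / Math.exp(100 * iScale); // roughly 1.9e10
  return Math.log(iScore) + 100 * iScale;     // equals Math.log(p), about -276.31
}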
public Tree<String> getBestParse(List<String> sentence) { // This implements the CKY algorithm int nEntries = sentence.size(); // hashmap to store back rules HashMap<Triplet<Integer, Integer, String>, Triplet<Integer, String, String>> backHash = new HashMap<Triplet<Integer, Integer, String>, Triplet<Integer, String, String>>(); // more efficient access with arrays, but must cast each time :( @SuppressWarnings("unchecked") Counter<String>[][] parseScores = (Counter<String>[][]) (new Counter[nEntries][nEntries]); for (int i = 0; i < nEntries; i++) { for (int j = 0; j < nEntries; j++) { parseScores[i][j] = new Counter<String>(); } } System.out.println(sentence.toString()); // First deal with the lexicons int index = 0; int span = 1; // All spans are 1 at the lexicon level for (String word : sentence) { for (String tag : lexicon.getAllTags()) { double score = lexicon.scoreTagging(word, tag); if (score >= 0.0) { // This lexicon may generate this word // We use a counter map in order to store the scores for this sentence parse. parseScores[index][index + span - 1].setCount(tag, score); } } index = index + 1; } // handle unary rules now // System.out.println("Lexicons found"); boolean added = true; while (added) { added = false; for (index = 0; index < sentence.size(); index++) { // For each index+ span pair, get the counter. Counter<String> count = parseScores[index][index + span - 1]; PriorityQueue<String> countAsPQ = count.asPriorityQueue(); while (countAsPQ.hasNext()) { String entry = countAsPQ.next(); // System.out.println("I am fine here!!"); List<UnaryRule> unaryRules = grammar.getUnaryRulesByChild(entry); for (UnaryRule rule : unaryRules) { // These are the unary rules which might give rise to the above preterminal double prob = rule.getScore() * parseScores[index][index + span - 1].getCount(entry); if (prob > parseScores[index][index + span - 1].getCount(rule.parent)) { parseScores[index][index + span - 1].setCount(rule.parent, prob); backHash.put( new Triplet<Integer, Integer, String>(index, index + span, rule.parent), new Triplet<Integer, String, String>(-1, entry, null)); added = true; } } } } } // System.out.println("Lexicon unaries dealt with"); // Now work with the grammar to produce higher level probabilities for (span = 2; span <= sentence.size(); span++) { for (int begin = 0; begin <= (sentence.size() - span); begin++) { int end = begin + span; for (int split = begin + 1; split <= end - 1; split++) { Counter<String> countLeft = parseScores[begin][split - 1]; Counter<String> countRight = parseScores[split][end - 1]; // List<BinaryRule> leftRules= new ArrayList<BinaryRule>(); HashMap<Integer, BinaryRule> leftMap = new HashMap<Integer, BinaryRule>(); // List<BinaryRule> rightRules=new ArrayList<BinaryRule>(); HashMap<Integer, BinaryRule> rightMap = new HashMap<Integer, BinaryRule>(); for (String entry : countLeft.keySet()) { for (BinaryRule rule : grammar.getBinaryRulesByLeftChild(entry)) { if (!leftMap.containsKey(rule.hashCode())) { leftMap.put(rule.hashCode(), rule); } } } for (String entry : countRight.keySet()) { for (BinaryRule rule : grammar.getBinaryRulesByRightChild(entry)) { if (!rightMap.containsKey(rule.hashCode())) { rightMap.put(rule.hashCode(), rule); } } } // System.out.println("About to enter the rules loops"); for (Integer ruleHash : leftMap.keySet()) { if (rightMap.containsKey(ruleHash)) { BinaryRule ruleRight = rightMap.get(ruleHash); double prob = ruleRight.getScore() * parseScores[begin][split - 1].getCount(ruleRight.leftChild) * parseScores[split][end - 
1].getCount(ruleRight.rightChild); // System.out.println(begin+" "+ end +" "+ ruleRight.parent+ " "+ prob); if (prob > parseScores[begin][end - 1].getCount(ruleRight.parent)) { // System.out.println(begin+" "+ end +" "+ ruleRight.parent+ " "+ prob); // System.out.println("parentrule :"+ ruleRight.getParent()); parseScores[begin][end - 1].setCount(ruleRight.getParent(), prob); backHash.put( new Triplet<Integer, Integer, String>(begin, end, ruleRight.parent), new Triplet<Integer, String, String>( split, ruleRight.leftChild, ruleRight.rightChild)); } } } // System.out.println("Exited rules loop"); } // System.out.println("Grammar found for " + begin + " "+ end); // Now handle unary rules added = true; while (added) { added = false; Counter<String> count = parseScores[begin][end - 1]; PriorityQueue<String> countAsPriorityQueue = count.asPriorityQueue(); while (countAsPriorityQueue.hasNext()) { String entry = countAsPriorityQueue.next(); List<UnaryRule> unaryRules = grammar.getUnaryRulesByChild(entry); for (UnaryRule rule : unaryRules) { double prob = rule.getScore() * parseScores[begin][end - 1].getCount(entry); if (prob > parseScores[begin][end - 1].getCount(rule.parent)) { parseScores[begin][end - 1].setCount(rule.parent, prob); backHash.put( new Triplet<Integer, Integer, String>(begin, end, rule.parent), new Triplet<Integer, String, String>(-1, entry, null)); added = true; } } } } // System.out.println("Unaries dealt for " + begin + " "+ end); } } // Create and return the parse tree Tree<String> parseTree = new Tree<String>("null"); // System.out.println(parseScores.getCounter(0+" "+sentence.size()).toString()); // Pick the argmax String parent = parseScores[0][nEntries - 1].argMax(); // Or pick root. This second one is preferred since sentences are meant to have ROOT as their // root node. parent = "ROOT"; parseTree = getParseTree(sentence, backHash, 0, sentence.size(), parent); // System.out.println("PARSE SCORES"); // System.out.println(parseScores.toString()); // System.out.println("BACK HASH"); // System.out.println(backHash.toString()); // parseTree = addRoot(parseTree); // System.out.println(parseTree.toString()); // return parseTree; return TreeAnnotations.unAnnotateTree(parseTree); }
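The span/begin/split loops above implement the usual probability-space CKY recurrence; roughly, using the chart indexing from this method:

// parseScores[begin][end - 1](A) = max over split in (begin, end) and binary rules A -> B C of
//     P(A -> B C) * parseScores[begin][split - 1](B) * parseScores[split][end - 1](C)
// with the unary closure re-applied after every span, and backHash recording (split, B, C) for a
// binary step or (-1, B, null) for a unary step so the tree can be rebuilt from the chart.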
private Regexp internalGetBoundedRegexp(Grammar g, String startSymbol, int bound) { if (regexpCache.containsKeys(startSymbol, bound)) return regexpCache.get(startSymbol, bound); // removing epsilons may not succeed for start symbol. We check if that's the case. boolean canGenerateEmptyString = g.containsEpsilonProduction(startSymbol); List<Regexp> x = new ArrayList<Regexp>(); for (GrammarProduction prod : g.getRule(startSymbol).getProductions()) { List<GrammarProductionElement> elems = prod.getElements(); if (canGenerateEmptyString || elems.size() <= bound) { // uses the fact that every symbol (other than start) produces at least // one terminal List<List<Integer>> distros = new ArrayList<List<Integer>>(distributions(bound, elems)); distrosLoop: for (int j = 0; j < distros.size(); j++) { List<Integer> distro = distros.get(j); Regexp[] exps = new Regexp[distro.size()]; for (int i = 0; i < elems.size(); i++) { GrammarProductionElement elem = elems.get(i); int sizeForElem = distro.get(i); if (terms_are_single_char) { if (sizeForElem > 1 && (elem.getKind() == GrammarElementKind.GTERMINAL || elem.getKind() == GrammarElementKind.GSPECIAL)) { continue distrosLoop; // no way you can generate a string longer than 1 from a terminal } if (sizeForElem == 1 && elem.getKind() == GrammarElementKind.GTERMINAL) { TerminalElement te = (TerminalElement) elem; exps[i] = HampiConstraints.constRegexp(te.getNameNoQuotes()); } else if (sizeForElem == 1 && elem.getKind() == GrammarElementKind.GSPECIAL) { SpecialElement spec = (SpecialElement) elem; exps[i] = HampiConstraints.constRegexp(spec.getNameNoDelimiters()); } else if (elem.getKind() == GrammarElementKind.GNONTERMINAL) { NonterminalElement nt = (NonterminalElement) elem; if (bounds.containsKey(nt) && bounds.get(nt) < sizeForElem) { // cannot generate a string longer than the upper bound on // all strings generatable from the nonterminal continue distrosLoop; } Regexp subRegexp = internalGetBoundedRegexp(g, nt.getName(), sizeForElem); if (subRegexp != null) { exps[i] = subRegexp; } else { continue distrosLoop; } } else throw new IllegalStateException("expected a nonterminal or special" + elem); } else { if (elem.getKind() == GrammarElementKind.GSPECIAL) throw new UnsupportedOperationException("not implemented yet"); if (elem.getKind() == GrammarElementKind.GTERMINAL) { TerminalElement term = (TerminalElement) elem; if (term.getNameNoQuotes().length() != sizeForElem) { continue distrosLoop; // no way you can generate a string this long } else { exps[i] = HampiConstraints.constRegexp(term.getNameNoQuotes()); } } else if (elem.getKind() == GrammarElementKind.GNONTERMINAL) { NonterminalElement nt = (NonterminalElement) elem; if (bounds.containsKey(nt) && bounds.get(nt) < sizeForElem) { // cannot generate a string longer than the upper bound on // all strings generatable from the nonterminal continue distrosLoop; } Regexp subRegexp = internalGetBoundedRegexp(g, nt.getName(), sizeForElem); if (subRegexp != null) { exps[i] = subRegexp; } else { continue distrosLoop; } } else throw new IllegalStateException("expected a nonterminal or special" + elem); } } Regexp e; if (exps.length == 1) { e = exps[0]; } else { e = HampiConstraints.concatRegexp(exps); } if (!x.contains(e)) { x.add(e); } } } } Regexp result; if (x.isEmpty() && !canGenerateEmptyString) { result = null; } else if (x.isEmpty() && canGenerateEmptyString) { Hampi h = new Hampi(); result = h.constRegexp(""); } else if (x.size() == 1) { result = x.get(0); } else { Hampi h = new Hampi(); result = 
h.orRegexp(x.toArray(new Regexp[x.size()])); } regexpCache.put(startSymbol, bound, result); return result; }
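Read as a recurrence, the method above appears to build, for a nonterminal S and length bound b, a regular expression R(S, b) by distributing the bound over the elements of each production (a sketch of the logic only, not additional API):

// R(S, b) = union over productions S -> e1 ... ek and over splits b = b1 + ... + bk of
//           concat(R(e1, b1), ..., R(ek, bk))
// where a terminal contributes itself only when its length matches its share bi, and a split is
// abandoned (continue distrosLoop) as soon as some element cannot yield a string of length bi.
// The epsilon check at the top loosens the "exactly b" reading for symbols that can derive the
// empty string.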
public static void main(String[] args) { OptionParser optParser = new OptionParser(Options.class); Options opts = (Options) optParser.parse(args, true); // provide feedback on command-line arguments System.out.println("Calling with " + optParser.getPassedInOptions()); String path = opts.path; // int lang = opts.lang; System.out.println("Loading trees from " + path + " and using language " + opts.treebank); double trainingFractionToKeep = opts.trainingFractionToKeep; int maxSentenceLength = opts.maxSentenceLength; System.out.println("Will remove sentences with more than " + maxSentenceLength + " words."); HORIZONTAL_MARKOVIZATION = opts.horizontalMarkovization; VERTICAL_MARKOVIZATION = opts.verticalMarkovization; System.out.println( "Using horizontal=" + HORIZONTAL_MARKOVIZATION + " and vertical=" + VERTICAL_MARKOVIZATION + " markovization."); Binarization binarization = opts.binarization; System.out.println( "Using " + binarization.name() + " binarization."); // and "+annotateString+"."); double randomness = opts.randomization; System.out.println("Using a randomness value of " + randomness); String outFileName = opts.outFileName; if (outFileName == null) { System.out.println("Output File name is required."); System.exit(-1); } else System.out.println("Using grammar output file " + outFileName + "."); VERBOSE = opts.verbose; RANDOM = new Random(opts.randSeed); System.out.println("Random number generator seeded at " + opts.randSeed + "."); boolean manualAnnotation = false; boolean baseline = opts.baseline; boolean noSplit = opts.noSplit; int numSplitTimes = opts.numSplits; if (baseline) numSplitTimes = 0; String splitGrammarFile = opts.inFile; int allowedDroppingIters = opts.di; int maxIterations = opts.splitMaxIterations; int minIterations = opts.splitMinIterations; if (minIterations > 0) System.out.println("I will do at least " + minIterations + " iterations."); double[] smoothParams = {opts.smoothingParameter1, opts.smoothingParameter2}; System.out.println("Using smoothing parameters " + smoothParams[0] + " and " + smoothParams[1]); boolean allowMoreSubstatesThanCounts = false; boolean findClosedUnaryPaths = opts.findClosedUnaryPaths; Corpus corpus = new Corpus( path, opts.treebank, trainingFractionToKeep, false, opts.skipSection, opts.skipBilingual); List<Tree<String>> trainTrees = Corpus.binarizeAndFilterTrees( corpus.getTrainTrees(), VERTICAL_MARKOVIZATION, HORIZONTAL_MARKOVIZATION, maxSentenceLength, binarization, manualAnnotation, VERBOSE); List<Tree<String>> validationTrees = Corpus.binarizeAndFilterTrees( corpus.getValidationTrees(), VERTICAL_MARKOVIZATION, HORIZONTAL_MARKOVIZATION, maxSentenceLength, binarization, manualAnnotation, VERBOSE); Numberer tagNumberer = Numberer.getGlobalNumberer("tags"); // for (Tree<String> t : trainTrees){ // System.out.println(t); // } if (opts.trainOnDevSet) { System.out.println("Adding devSet to training data."); trainTrees.addAll(validationTrees); } if (opts.lowercase) { System.out.println("Lowercasing the treebank."); Corpus.lowercaseWords(trainTrees); Corpus.lowercaseWords(validationTrees); } int nTrees = trainTrees.size(); System.out.println("There are " + nTrees + " trees in the training set."); double filter = opts.filter; if (filter > 0) System.out.println( "Will remove rules with prob under " + filter + ".\nEven though only unlikely rules are pruned the training LL is not guaranteed to increase in every round anymore " + "(especially when we are close to converging)." 
+ "\nFurthermore it increases the variance because 'good' rules can be pruned away in early stages."); short nSubstates = opts.nSubStates; short[] numSubStatesArray = initializeSubStateArray(trainTrees, validationTrees, tagNumberer, nSubstates); if (baseline) { short one = 1; Arrays.fill(numSubStatesArray, one); System.out.println("Training just the baseline grammar (1 substate for all states)"); randomness = 0.0f; } if (VERBOSE) { for (int i = 0; i < numSubStatesArray.length; i++) { System.out.println("Tag " + (String) tagNumberer.object(i) + " " + i); } } System.out.println("There are " + numSubStatesArray.length + " observed categories."); // initialize lexicon and grammar Lexicon lexicon = null, maxLexicon = null, previousLexicon = null; Grammar grammar = null, maxGrammar = null, previousGrammar = null; double maxLikelihood = Double.NEGATIVE_INFINITY; // String smootherStr = opts.smooth; // Smoother lexiconSmoother = null; // Smoother grammarSmoother = null; // if (splitGrammarFile!=null){ // lexiconSmoother = maxLexicon.smoother; // grammarSmoother = maxGrammar.smoother; // System.out.println("Using smoother from input grammar."); // } // else if (smootherStr.equals("NoSmoothing")) // lexiconSmoother = grammarSmoother = new NoSmoothing(); // else if (smootherStr.equals("SmoothAcrossParentBits")) { // lexiconSmoother = grammarSmoother = new SmoothAcrossParentBits(grammarSmoothing, // maxGrammar.splitTrees); // } // else // throw new Error("I didn't understand the type of smoother '"+smootherStr+"'"); // System.out.println("Using smoother "+smootherStr); // EM: iterate until the validation likelihood drops for four consecutive // iterations int iter = 0; int droppingIter = 0; // If we are splitting, we load the old grammar and start off by splitting. int startSplit = 0; if (splitGrammarFile != null) { System.out.println("Loading old grammar from " + splitGrammarFile); startSplit = 1; // we've already trained the grammar ParserData pData = ParserData.Load(splitGrammarFile); maxGrammar = pData.gr; maxLexicon = pData.lex; numSubStatesArray = maxGrammar.numSubStates; previousGrammar = grammar = maxGrammar; previousLexicon = lexicon = maxLexicon; Numberer.setNumberers(pData.getNumbs()); tagNumberer = Numberer.getGlobalNumberer("tags"); System.out.println("Loading old grammar complete."); if (noSplit) { System.out.println("Will NOT split the loaded grammar."); startSplit = 0; } } double mergingPercentage = opts.mergingPercentage; boolean separateMergingThreshold = opts.separateMergingThreshold; if (mergingPercentage > 0) { System.out.println( "Will merge " + (int) (mergingPercentage * 100) + "% of the splits in each round."); System.out.println( "The threshold for merging lexical and phrasal categories will be set separately: " + separateMergingThreshold); } StateSetTreeList trainStateSetTrees = new StateSetTreeList(trainTrees, numSubStatesArray, false, tagNumberer); StateSetTreeList validationStateSetTrees = new StateSetTreeList(validationTrees, numSubStatesArray, false, tagNumberer); // deletePC); // get rid of the old trees trainTrees = null; validationTrees = null; corpus = null; System.gc(); if (opts.simpleLexicon) { System.out.println( "Replacing words which have been seen less than 5 times with their signature."); Corpus.replaceRareWords( trainStateSetTrees, new SimpleLexicon(numSubStatesArray, -1), opts.rare); } // If we're training without loading a split grammar, then we run once without splitting. 
if (splitGrammarFile == null) { grammar = new Grammar(numSubStatesArray, findClosedUnaryPaths, new NoSmoothing(), null, filter); Lexicon tmp_lexicon = (opts.simpleLexicon) ? new SimpleLexicon( numSubStatesArray, -1, smoothParams, new NoSmoothing(), filter, trainStateSetTrees) : new SophisticatedLexicon( numSubStatesArray, SophisticatedLexicon.DEFAULT_SMOOTHING_CUTOFF, smoothParams, new NoSmoothing(), filter); int n = 0; boolean secondHalf = false; for (Tree<StateSet> stateSetTree : trainStateSetTrees) { secondHalf = (n++ > nTrees / 2.0); tmp_lexicon.trainTree(stateSetTree, randomness, null, secondHalf, false, opts.rare); } lexicon = (opts.simpleLexicon) ? new SimpleLexicon( numSubStatesArray, -1, smoothParams, new NoSmoothing(), filter, trainStateSetTrees) : new SophisticatedLexicon( numSubStatesArray, SophisticatedLexicon.DEFAULT_SMOOTHING_CUTOFF, smoothParams, new NoSmoothing(), filter); for (Tree<StateSet> stateSetTree : trainStateSetTrees) { secondHalf = (n++ > nTrees / 2.0); lexicon.trainTree(stateSetTree, randomness, tmp_lexicon, secondHalf, false, opts.rare); grammar.tallyUninitializedStateSetTree(stateSetTree); } lexicon.tieRareWordStats(opts.rare); lexicon.optimize(); // SSIE ((SophisticatedLexicon) lexicon).overwriteWithMaxent(); grammar.optimize(randomness); // System.out.println(grammar); previousGrammar = maxGrammar = grammar; // needed for baseline - when there is no EM loop previousLexicon = maxLexicon = lexicon; } // the main loop: split and train the grammar for (int splitIndex = startSplit; splitIndex < numSplitTimes * 3; splitIndex++) { // now do either a merge or a split and the end a smooth // on odd iterations merge, on even iterations split String opString = ""; if (splitIndex % 3 == 2) { // (splitIndex==numSplitTimes*2){ if (opts.smooth.equals("NoSmoothing")) continue; System.out.println("Setting smoother for grammar and lexicon."); Smoother grSmoother = new SmoothAcrossParentBits(0.01, maxGrammar.splitTrees); Smoother lexSmoother = new SmoothAcrossParentBits(0.1, maxGrammar.splitTrees); // Smoother grSmoother = new SmoothAcrossParentSubstate(0.01); // Smoother lexSmoother = new SmoothAcrossParentSubstate(0.1); maxGrammar.setSmoother(grSmoother); maxLexicon.setSmoother(lexSmoother); minIterations = maxIterations = opts.smoothMaxIterations; opString = "smoothing"; } else if (splitIndex % 3 == 0) { // the case where we split if (opts.noSplit) continue; System.out.println( "Before splitting, we have a total of " + maxGrammar.totalSubStates() + " substates."); CorpusStatistics corpusStatistics = new CorpusStatistics(tagNumberer, trainStateSetTrees); int[] counts = corpusStatistics.getSymbolCounts(); maxGrammar = maxGrammar.splitAllStates(randomness, counts, allowMoreSubstatesThanCounts, 0); maxLexicon = maxLexicon.splitAllStates(counts, allowMoreSubstatesThanCounts, 0); Smoother grSmoother = new NoSmoothing(); Smoother lexSmoother = new NoSmoothing(); maxGrammar.setSmoother(grSmoother); maxLexicon.setSmoother(lexSmoother); System.out.println( "After splitting, we have a total of " + maxGrammar.totalSubStates() + " substates."); System.out.println( "Rule probabilities are NOT normalized in the split, therefore the training LL is not guaranteed to improve between iteration 0 and 1!"); opString = "splitting"; maxIterations = opts.splitMaxIterations; minIterations = opts.splitMinIterations; } else { if (mergingPercentage == 0) continue; // the case where we merge double[][] mergeWeights = GrammarMerger.computeMergeWeights(maxGrammar, maxLexicon, trainStateSetTrees); 
double[][][] deltas = GrammarMerger.computeDeltas(maxGrammar, maxLexicon, mergeWeights, trainStateSetTrees); boolean[][][] mergeThesePairs = GrammarMerger.determineMergePairs( deltas, separateMergingThreshold, mergingPercentage, maxGrammar); grammar = GrammarMerger.doTheMerges(maxGrammar, maxLexicon, mergeThesePairs, mergeWeights); short[] newNumSubStatesArray = grammar.numSubStates; trainStateSetTrees = new StateSetTreeList(trainStateSetTrees, newNumSubStatesArray, false); validationStateSetTrees = new StateSetTreeList(validationStateSetTrees, newNumSubStatesArray, false); // retrain lexicon to finish the lexicon merge (updates the unknown words model)... lexicon = (opts.simpleLexicon) ? new SimpleLexicon( newNumSubStatesArray, -1, smoothParams, maxLexicon.getSmoother(), filter, trainStateSetTrees) : new SophisticatedLexicon( newNumSubStatesArray, SophisticatedLexicon.DEFAULT_SMOOTHING_CUTOFF, maxLexicon.getSmoothingParams(), maxLexicon.getSmoother(), maxLexicon.getPruningThreshold()); boolean updateOnlyLexicon = true; double trainingLikelihood = GrammarTrainer.doOneEStep( grammar, maxLexicon, null, lexicon, trainStateSetTrees, updateOnlyLexicon, opts.rare); // System.out.println("The training LL is "+trainingLikelihood); lexicon .optimize(); // Grammar.RandomInitializationType.INITIALIZE_WITH_SMALL_RANDOMIZATION); // // M Step GrammarMerger.printMergingStatistics(maxGrammar, grammar); opString = "merging"; maxGrammar = grammar; maxLexicon = lexicon; maxIterations = opts.mergeMaxIterations; minIterations = opts.mergeMinIterations; } // update the substate dependent objects previousGrammar = grammar = maxGrammar; previousLexicon = lexicon = maxLexicon; droppingIter = 0; numSubStatesArray = grammar.numSubStates; trainStateSetTrees = new StateSetTreeList(trainStateSetTrees, numSubStatesArray, false); validationStateSetTrees = new StateSetTreeList(validationStateSetTrees, numSubStatesArray, false); maxLikelihood = calculateLogLikelihood(maxGrammar, maxLexicon, validationStateSetTrees); System.out.println( "After " + opString + " in the " + (splitIndex / 3 + 1) + "th round, we get a validation likelihood of " + maxLikelihood); iter = 0; // the inner loop: train the grammar via EM until validation likelihood reliably drops do { iter += 1; System.out.println("Beginning iteration " + (iter - 1) + ":"); // 1) Compute the validation likelihood of the previous iteration System.out.print("Calculating validation likelihood..."); double validationLikelihood = calculateLogLikelihood( previousGrammar, previousLexicon, validationStateSetTrees); // The validation LL of previousGrammar/previousLexicon System.out.println("done: " + validationLikelihood); // 2) Perform the E step while computing the training likelihood of the previous iteration System.out.print("Calculating training likelihood..."); grammar = new Grammar( grammar.numSubStates, grammar.findClosedPaths, grammar.smoother, grammar, grammar.threshold); lexicon = (opts.simpleLexicon) ? 
new SimpleLexicon( grammar.numSubStates, -1, smoothParams, lexicon.getSmoother(), filter, trainStateSetTrees) : new SophisticatedLexicon( grammar.numSubStates, SophisticatedLexicon.DEFAULT_SMOOTHING_CUTOFF, lexicon.getSmoothingParams(), lexicon.getSmoother(), lexicon.getPruningThreshold()); boolean updateOnlyLexicon = false; double trainingLikelihood = doOneEStep( previousGrammar, previousLexicon, grammar, lexicon, trainStateSetTrees, updateOnlyLexicon, opts.rare); // The training LL of previousGrammar/previousLexicon System.out.println("done: " + trainingLikelihood); // 3) Perform the M-Step lexicon.optimize(); // M Step grammar.optimize(0); // M Step // 4) Check whether previousGrammar/previousLexicon was in fact better than the best if (iter < minIterations || validationLikelihood >= maxLikelihood) { maxLikelihood = validationLikelihood; maxGrammar = previousGrammar; maxLexicon = previousLexicon; droppingIter = 0; } else { droppingIter++; } // 5) advance the 'pointers' previousGrammar = grammar; previousLexicon = lexicon; } while ((droppingIter < allowedDroppingIters) && (!baseline) && (iter < maxIterations)); // Dump a grammar file to disk from time to time ParserData pData = new ParserData( maxLexicon, maxGrammar, null, Numberer.getNumberers(), numSubStatesArray, VERTICAL_MARKOVIZATION, HORIZONTAL_MARKOVIZATION, binarization); String outTmpName = outFileName + "_" + (splitIndex / 3 + 1) + "_" + opString + ".gr"; System.out.println("Saving grammar to " + outTmpName + "."); if (pData.Save(outTmpName)) System.out.println("Saving successful."); else System.out.println("Saving failed!"); pData = null; } // The last grammar/lexicon has not yet been evaluated. Even though the validation likelihood // has been dropping in the past few iteration, there is still a chance that the last one was in // fact the best so just in case we evaluate it. System.out.print("Calculating last validation likelihood..."); double validationLikelihood = calculateLogLikelihood(grammar, lexicon, validationStateSetTrees); System.out.println( "done.\n Iteration " + iter + " (final) gives validation likelihood " + validationLikelihood); if (validationLikelihood > maxLikelihood) { maxLikelihood = validationLikelihood; maxGrammar = previousGrammar; maxLexicon = previousLexicon; } ParserData pData = new ParserData( maxLexicon, maxGrammar, null, Numberer.getNumberers(), numSubStatesArray, VERTICAL_MARKOVIZATION, HORIZONTAL_MARKOVIZATION, binarization); System.out.println("Saving grammar to " + outFileName + "."); System.out.println("It gives a validation data log likelihood of: " + maxLikelihood); if (pData.Save(outFileName)) System.out.println("Saving successful."); else System.out.println("Saving failed!"); System.exit(0); }
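To summarize the training schedule implemented above, reading the splitIndex % 3 dispatch as written:

// splitIndex % 3 == 0 -> split every state's substates (skipped with -noSplit), then run EM;
// splitIndex % 3 == 1 -> merge back the least useful splits (skipped if mergingPercentage == 0), then run EM;
// splitIndex % 3 == 2 -> attach the SmoothAcrossParentBits smoothers, then run EM.
// Each inner EM loop ends once the validation likelihood has dropped for allowedDroppingIters
// consecutive iterations (or maxIterations is hit), and the best maxGrammar/maxLexicon pair is
// dumped to disk as <outFileName>_<round>_<operation>.gr after every phase.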
public Tree<String> getBestParse(List<String> sentence) { // TODO: implement this method int n = sentence.size(); // System.out.println("getBestParse: n=" + n); List<List<Map<Object, Double>>> scores = new ArrayList<List<Map<Object, Double>>>(n + 1); for (int i = 0; i < n + 1; i++) { List<Map<Object, Double>> row = new ArrayList<Map<Object, Double>>(n + 1); for (int j = 0; j < n + 1; j++) { row.add(new HashMap<Object, Double>()); } scores.add(row); } List<List<Map<Object, Triplet<Integer, Object, Object>>>> backs = new ArrayList<List<Map<Object, Triplet<Integer, Object, Object>>>>(n + 1); for (int i = 0; i < n + 1; i++) { List<Map<Object, Triplet<Integer, Object, Object>>> row = new ArrayList<Map<Object, Triplet<Integer, Object, Object>>>(n + 1); for (int j = 0; j < n + 1; j++) { row.add(new HashMap<Object, Triplet<Integer, Object, Object>>()); } backs.add(row); } /* System.out.println("scores=" + scores.size() + "x" + scores.get(0).size()); System.out.println("backs=" + backs.size() + "x" + backs.get(0).size()); printChart(scores, backs, "scores"); */ // First the Lexicon for (int i = 0; i < n; i++) { String word = sentence.get(i); for (String tag : lexicon.getAllTags()) { UnaryRule A = new UnaryRule(tag, word); A.setScore(Math.log(lexicon.scoreTagging(word, tag))); scores.get(i).get(i + 1).put(A, A.getScore()); backs.get(i).get(i + 1).put(A, null); } // System.out.println("Starting unaries: i=" + i + ",n=" + n ); // Handle unaries boolean added = true; while (added) { added = false; Map<Object, Double> A_scores = scores.get(i).get(i + 1); // Don't modify the dict we are iterating List<Object> A_keys = copyKeys(A_scores); // for (int j = 0; j < 5 && j < A_keys.size(); j++) { // System.out.print("," + j + "=" + A_scores.get(A_keys.get(j))); // } for (Object oB : A_keys) { UnaryRule B = (UnaryRule) oB; for (UnaryRule A : grammar.getUnaryRulesByChild(B.getParent())) { double prob = Math.log(A.getScore()) + A_scores.get(B); if (prob > -1000.0) { if (!A_scores.containsKey(A) || prob > A_scores.get(A)) { // System.out.print(" *A=" + A + ", B=" + B); // System.out.print(", prob=" + prob); // System.out.println(", A_scores.get(A)=" + A_scores.get(A)); A_scores.put(A, prob); backs.get(i).get(i + 1).put(A, new Triplet<Integer, Object, Object>(-1, B, null)); added = true; } // System.out.println(", added=" + added); } } } // System.out.println(", A_scores=" + A_scores.size() + ", added=" + added); } } // printChart(scores, backs, "scores with Lexicon"); // Do higher layers // Naming is based on rules: A -> B,C long startTime = new Date().getTime(); for (int span = 2; span < n + 1; span++) { for (int begin = 0; begin < n + 1 - span; begin++) { int end = begin + span; Map<Object, Double> A_scores = scores.get(begin).get(end); Map<Object, Triplet<Integer, Object, Object>> A_backs = backs.get(begin).get(end); for (int split = begin + 1; split < end; split++) { Map<Object, Double> B_scores = scores.get(begin).get(split); Map<Object, Double> C_scores = scores.get(split).get(end); List<Object> B_list = new ArrayList<Object>(B_scores.keySet()); List<Object> C_list = new ArrayList<Object>(C_scores.keySet()); // This is a key optimization. 
!@#$ // It avoids a B_list.size() x C_list.size() search in the for (Object B : B_list) loop Map<String, List<Object>> C_map = new HashMap<String, List<Object>>(); for (Object C : C_list) { String parent = getParent(C); if (!C_map.containsKey(parent)) { C_map.put(parent, new ArrayList<Object>()); } C_map.get(parent).add(C); } for (Object B : B_list) { for (BinaryRule A : grammar.getBinaryRulesByLeftChild(getParent(B))) { if (C_map.containsKey(A.getRightChild())) { for (Object C : C_map.get(A.getRightChild())) { // We now have A which has B as left child and C as right child double prob = Math.log(A.getScore()) + B_scores.get(B) + C_scores.get(C); if (!A_scores.containsKey(A) || prob > A_scores.get(A)) { A_scores.put(A, prob); A_backs.put(A, new Triplet<Integer, Object, Object>(split, B, C)); } } } } } } // Handle unaries: A -> B boolean added = true; while (added) { added = false; // Don't modify the dict we are iterating List<Object> A_keys = copyKeys(A_scores); for (Object oB : A_keys) { for (UnaryRule A : grammar.getUnaryRulesByChild(getParent(oB))) { double prob = Math.log(A.getScore()) + A_scores.get(oB); if (!A_scores.containsKey(A) || prob > A_scores.get(A)) { A_scores.put(A, prob); A_backs.put(A, new Triplet<Integer, Object, Object>(-1, oB, null)); added = true; } } } } } } // printChart(scores, backs, "scores with Lexicon and Grammar"); Map<Object, Double> topOfChart = scores.get(0).get(n); System.out.println("topOfChart: " + topOfChart.size()); /* for (Object o: topOfChart.keySet()) { System.out.println("o=" + o + ", score=" + topOfChart.getCount(o)); } */ // All parses have "ROOT" at top of tree Object bestKey = null; Object secondBestKey = null; double bestScore = Double.NEGATIVE_INFINITY; double secondBestScore = Double.NEGATIVE_INFINITY; for (Object key : topOfChart.keySet()) { double score = topOfChart.get(key); if (score >= secondBestScore || secondBestKey == null) { secondBestKey = key; secondBestScore = score; } if ("ROOT".equals(getParent(key)) && (score >= bestScore || bestKey == null)) { bestKey = key; bestScore = score; } } if (bestKey == null) { bestKey = secondBestKey; System.out.println("secondBestKey=" + secondBestKey); } if (bestKey == null) { for (Object key : topOfChart.keySet()) { System.out.println("val=" + topOfChart.get(key) + ", key=" + key); } } System.out.println("bestKey=" + bestKey + ", log(prob)=" + topOfChart.get(bestKey)); Tree<String> result = makeTree(backs, 0, n, bestKey); if (!"ROOT".equals(result.getLabel())) { List<Tree<String>> children = new ArrayList<Tree<String>>(); children.add(result); result = new Tree<String>("ROOT", children); // !@#$ } /* System.out.println("=================================================="); System.out.println(result); System.out.println("====================^^^^^^========================"); */ return TreeAnnotations.unAnnotateTree(result); }
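Unlike the probability-space CKY chart earlier in this file, this version scores in log space, so rule applications add rather than multiply; the update it performs is roughly:

// scores[begin][end](A) = max over split and binary rules A -> B C of
//     Math.log(P(A -> B C)) + scores[begin][split](B) + scores[split][end](C)
// Working in log space keeps long-sentence scores from underflowing to 0.0, at the cost of a
// Math.log call each time a grammar rule is applied.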
public Tree<String> getBestParseOld(List<String> sentence) { // TODO: This implements the CKY algorithm CounterMap<String, String> parseScores = new CounterMap<String, String>(); System.out.println(sentence.toString()); // First deal with the lexicons int index = 0; int span = 1; // All spans are 1 at the lexicon level for (String word : sentence) { for (String tag : lexicon.getAllTags()) { double score = lexicon.scoreTagging(word, tag); if (score >= 0.0) { // This lexicon may generate this word // We use a counter map in order to store the scores for this sentence parse. parseScores.setCount(index + " " + (index + span), tag, score); } } index = index + 1; } // handle unary rules now HashMap<String, Triplet<Integer, String, String>> backHash = new HashMap< String, Triplet<Integer, String, String>>(); // hashmap to store back propation // System.out.println("Lexicons found"); Boolean added = true; while (added) { added = false; for (index = 0; index < sentence.size(); index++) { // For each index+ span pair, get the counter. Counter<String> count = parseScores.getCounter(index + " " + (index + span)); PriorityQueue<String> countAsPQ = count.asPriorityQueue(); while (countAsPQ.hasNext()) { String entry = countAsPQ.next(); // System.out.println("I am fine here!!"); List<UnaryRule> unaryRules = grammar.getUnaryRulesByChild(entry); for (UnaryRule rule : unaryRules) { // These are the unary rules which might give rise to the above preterminal double prob = rule.getScore() * parseScores.getCount(index + " " + (index + span), entry); if (prob > parseScores.getCount(index + " " + (index + span), rule.parent)) { parseScores.setCount(index + " " + (index + span), rule.parent, prob); backHash.put( index + " " + (index + span) + " " + rule.parent, new Triplet<Integer, String, String>(-1, entry, null)); added = true; } } } } } // System.out.println("Lexicon unaries dealt with"); // Now work with the grammar to produce higher level probabilities for (span = 2; span <= sentence.size(); span++) { for (int begin = 0; begin <= (sentence.size() - span); begin++) { int end = begin + span; for (int split = begin + 1; split <= end - 1; split++) { Counter<String> countLeft = parseScores.getCounter(begin + " " + split); Counter<String> countRight = parseScores.getCounter(split + " " + end); // List<BinaryRule> leftRules= new ArrayList<BinaryRule>(); HashMap<Integer, BinaryRule> leftMap = new HashMap<Integer, BinaryRule>(); // List<BinaryRule> rightRules=new ArrayList<BinaryRule>(); HashMap<Integer, BinaryRule> rightMap = new HashMap<Integer, BinaryRule>(); for (String entry : countLeft.keySet()) { for (BinaryRule rule : grammar.getBinaryRulesByLeftChild(entry)) { if (!leftMap.containsKey(rule.hashCode())) { leftMap.put(rule.hashCode(), rule); } } } for (String entry : countRight.keySet()) { for (BinaryRule rule : grammar.getBinaryRulesByRightChild(entry)) { if (!rightMap.containsKey(rule.hashCode())) { rightMap.put(rule.hashCode(), rule); } } } // System.out.println("About to enter the rules loops"); for (Integer ruleHash : leftMap.keySet()) { if (rightMap.containsKey(ruleHash)) { BinaryRule ruleRight = rightMap.get(ruleHash); double prob = ruleRight.getScore() * parseScores.getCount(begin + " " + split, ruleRight.leftChild) * parseScores.getCount(split + " " + end, ruleRight.rightChild); // System.out.println(begin+" "+ end +" "+ ruleRight.parent+ " "+ prob); if (prob > parseScores.getCount(begin + " " + end, ruleRight.parent)) { // System.out.println(begin+" "+ end +" "+ ruleRight.parent+ " "+ prob); // 
System.out.println("parentrule :"+ ruleRight.getParent()); parseScores.setCount(begin + " " + end, ruleRight.getParent(), prob); backHash.put( begin + " " + end + " " + ruleRight.parent, new Triplet<Integer, String, String>( split, ruleRight.leftChild, ruleRight.rightChild)); } } } // System.out.println("Exited rules loop"); } // System.out.println("Grammar found for " + begin + " "+ end); // Now handle unary rules added = true; while (added) { added = false; Counter<String> count = parseScores.getCounter(begin + " " + end); PriorityQueue<String> countAsPriorityQueue = count.asPriorityQueue(); while (countAsPriorityQueue.hasNext()) { String entry = countAsPriorityQueue.next(); List<UnaryRule> unaryRules = grammar.getUnaryRulesByChild(entry); for (UnaryRule rule : unaryRules) { double prob = rule.getScore() * parseScores.getCount(begin + " " + (end), entry); if (prob > parseScores.getCount(begin + " " + (end), rule.parent)) { parseScores.setCount(begin + " " + (end), rule.parent, prob); backHash.put( begin + " " + (end) + " " + rule.parent, new Triplet<Integer, String, String>(-1, entry, null)); added = true; } } } } // System.out.println("Unaries dealt for " + begin + " "+ end); } } // Create and return the parse tree Tree<String> parseTree = new Tree<String>("null"); // System.out.println(parseScores.getCounter(0+" "+sentence.size()).toString()); String parent = parseScores.getCounter(0 + " " + sentence.size()).argMax(); if (parent == null) { System.out.println(parseScores.getCounter(0 + " " + sentence.size()).toString()); System.out.println("THIS IS WEIRD"); } parent = "ROOT"; parseTree = getParseTreeOld(sentence, backHash, 0, sentence.size(), parent); // System.out.println("PARSE SCORES"); // System.out.println(parseScores.toString()); // System.out.println("BACK HASH"); // System.out.println(backHash.toString()); // parseTree = addRoot(parseTree); // System.out.println(parseTree.toString()); // return parseTree; return TreeAnnotations.unAnnotateTree(parseTree); }
public void process() {
  boolean exceptionWhenWritingLexerFile = false;
  String lexerGrammarFileName = null; // necessary at this scope to have access in the catch below

  // Have to be tricky here when Maven or build tools call in and must new Tool()
  // before setting options. The banner won't display that way!
  if (isVerbose() && showBanner) {
    ErrorManager.info("ANTLR Parser Generator Version " + VERSION);
    showBanner = false;
  }

  try {
    sortGrammarFiles(); // update grammarFileNames
  } catch (Exception e) {
    ErrorManager.error(ErrorManager.MSG_INTERNAL_ERROR, e);
  } catch (Error e) {
    ErrorManager.error(ErrorManager.MSG_INTERNAL_ERROR, e);
  }

  for (String grammarFileName : grammarFileNames) {
    // If we are in make mode (to support build tools like Maven) and the
    // file is already up to date, then we do not build it (and in verbose mode
    // we will say so).
    if (make) {
      try {
        if (!buildRequired(grammarFileName)) continue;
      } catch (Exception e) {
        ErrorManager.error(ErrorManager.MSG_INTERNAL_ERROR, e);
      }
    }

    if (isVerbose() && !isDepend()) {
      System.out.println(grammarFileName);
    }

    try {
      if (isDepend()) {
        BuildDependencyGenerator dep = new BuildDependencyGenerator(this, grammarFileName);
        /*
        List outputFiles = dep.getGeneratedFileList();
        List dependents = dep.getDependenciesFileList();
        System.out.println("output: "+outputFiles);
        System.out.println("dependents: "+dependents);
        */
        System.out.println(dep.getDependencies());
        continue;
      }

      Grammar grammar = getRootGrammar(grammarFileName);
      // we now have all grammars read in as ASTs
      // (i.e., root and all delegates)
      grammar.composite.assignTokenTypes();
      grammar.composite.defineGrammarSymbols();
      grammar.composite.createNFAs();

      generateRecognizer(grammar);

      if (isPrintGrammar()) {
        grammar.printGrammar(System.out);
      }

      if (isReport()) {
        GrammarReport greport = new GrammarReport(grammar);
        System.out.println(greport.toString());
        // print out a backtracking report too (that is not encoded into log)
        System.out.println(greport.getBacktrackingReport());
        // same for aborted NFA->DFA conversions
        System.out.println(greport.getAnalysisTimeoutReport());
      }

      if (isProfile()) {
        GrammarReport greport = new GrammarReport(grammar);
        Stats.writeReport(GrammarReport.GRAMMAR_STATS_FILENAME, greport.toNotifyString());
      }

      // now handle the lexer if one was created for a merged spec
      String lexerGrammarStr = grammar.getLexerGrammar();
      // System.out.println("lexer grammar:\n"+lexerGrammarStr);
      if (grammar.type == Grammar.COMBINED && lexerGrammarStr != null) {
        lexerGrammarFileName = grammar.getImplicitlyGeneratedLexerFileName();
        try {
          Writer w = getOutputFile(grammar, lexerGrammarFileName);
          w.write(lexerGrammarStr);
          w.close();
        } catch (IOException e) {
          // emit different error message when creating the implicit lexer fails
          // due to write permission error
          exceptionWhenWritingLexerFile = true;
          throw e;
        }
        try {
          StringReader sr = new StringReader(lexerGrammarStr);
          Grammar lexerGrammar = new Grammar();
          lexerGrammar.composite.watchNFAConversion = internalOption_watchNFAConversion;
          lexerGrammar.implicitLexer = true;
          lexerGrammar.setTool(this);
          File lexerGrammarFullFile =
              new File(getFileDirectory(lexerGrammarFileName), lexerGrammarFileName);
          lexerGrammar.setFileName(lexerGrammarFullFile.toString());
          lexerGrammar.importTokenVocabulary(grammar);
          lexerGrammar.parseAndBuildAST(sr);
          sr.close();
          lexerGrammar.composite.assignTokenTypes();
          lexerGrammar.composite.defineGrammarSymbols();
          lexerGrammar.composite.createNFAs();
          generateRecognizer(lexerGrammar);
        } finally {
          // make sure we clean up
          if (deleteTempLexer) {
            File outputDir = getOutputDirectory(lexerGrammarFileName);
            File outputFile = new File(outputDir, lexerGrammarFileName);
            outputFile.delete();
          }
        }
      }
    } catch (IOException e) {
      if (exceptionWhenWritingLexerFile) {
        ErrorManager.error(ErrorManager.MSG_CANNOT_WRITE_FILE, lexerGrammarFileName, e);
      } else {
        ErrorManager.error(ErrorManager.MSG_CANNOT_OPEN_FILE, grammarFileName);
      }
    } catch (Exception e) {
      ErrorManager.error(ErrorManager.MSG_INTERNAL_ERROR, grammarFileName, e);
    }
    /*
    finally {
      System.out.println("creates="+ Interval.creates);
      System.out.println("hits="+ Interval.hits);
      System.out.println("misses="+ Interval.misses);
      System.out.println("outOfRange="+ Interval.outOfRange);
    }
    */
  }
}