示例#1
0
文件: Tool.java 项目: laiello/perseph
  /**
   * Create NFA, DFA and generate code for grammar. Create NFA for any delegates first. Once all NFA
   * are created, it's ok to create DFA, which must check for left-recursion. That check is done by
   * walking the full NFA, which therefore must be complete. After all NFA, comes DFA conversion for
   * root grammar then code gen for root grammar. DFA and code gen for delegates comes next.
   */
  protected void generateRecognizer(Grammar grammar) {
    String language = (String) grammar.getOption("language");
    if (language != null) {
      CodeGenerator generator = new CodeGenerator(this, grammar, language);
      grammar.setCodeGenerator(generator);
      generator.setDebug(isDebug());
      generator.setProfile(isProfile());
      generator.setTrace(isTrace());

      // generate NFA early in case of crash later (for debugging)
      if (isGenerate_NFA_dot()) {
        generateNFAs(grammar);
      }

      // GENERATE CODE
      generator.genRecognizer();

      if (isGenerate_DFA_dot()) {
        generateDFAs(grammar);
      }

      List<Grammar> delegates = grammar.getDirectDelegates();
      for (int i = 0; delegates != null && i < delegates.size(); i++) {
        Grammar delegate = (Grammar) delegates.get(i);
        if (delegate != grammar) { // already processing this one
          generateRecognizer(delegate);
        }
      }
    }
  }
示例#2
0
  ATN createATN(Grammar g) {
    if (g.atn != null) return g.atn;
    semanticProcess(g);

    ParserATNFactory f = new ParserATNFactory(g);
    if (g.isLexer()) f = new LexerATNFactory((LexerGrammar) g);
    g.atn = f.createATN();

    return g.atn;
  }
示例#3
0
 protected void semanticProcess(Grammar g) {
   if (g.ast != null && !g.ast.hasErrors) {
     System.out.println(g.ast.toStringTree());
     Tool antlr = new Tool();
     SemanticPipeline sem = new SemanticPipeline(g);
     sem.process();
     if (g.getImportedGrammars() != null) { // process imported grammars (if any)
       for (Grammar imp : g.getImportedGrammars()) {
         antlr.processNonCombinedGrammar(imp, false);
       }
     }
   }
 }
示例#4
0
 public String toString() {
   return "GrammarCorpus, based on "
       + grammar.getClass()
       + "-grammar, "
       + sentences
       + " sentences ";
 }
示例#5
0
文件: Tool.java 项目: laiello/perseph
  /**
   * This method is used by all code generators to create new output files. If the outputDir set by
   * -o is not present it will be created. The final filename is sensitive to the output directory
   * and the directory where the grammar file was found. If -o is /tmp and the original grammar file
   * was foo/t.g then output files go in /tmp/foo.
   *
   * <p>The output dir -o spec takes precedence if it's absolute. E.g., if the grammar file dir is
   * absolute the output dir is given precendence. "-o /tmp /usr/lib/t.g" results in "/tmp/T.java"
   * as output (assuming t.g holds T.java).
   *
   * <p>If no -o is specified, then just write to the directory where the grammar file was found.
   *
   * <p>If outputDirectory==null then write a String.
   */
  public Writer getOutputFile(Grammar g, String fileName) throws IOException {
    if (getOutputDirectory() == null) {
      return new StringWriter();
    }
    // output directory is a function of where the grammar file lives
    // for subdir/T.g, you get subdir here.  Well, depends on -o etc...
    // But, if this is a .tokens file, then we force the output to
    // be the base output directory (or current directory if there is not a -o)
    //
    File outputDir;
    if (fileName.endsWith(CodeGenerator.VOCAB_FILE_EXTENSION)) {
      if (haveOutputDir) {
        outputDir = new File(getOutputDirectory());
      } else {
        outputDir = new File(".");
      }
    } else {
      outputDir = getOutputDirectory(g.getFileName());
    }
    File outputFile = new File(outputDir, fileName);

    if (!outputDir.exists()) {
      outputDir.mkdirs();
    }
    FileWriter fw = new FileWriter(outputFile);
    return new BufferedWriter(fw);
  }
示例#6
0
文件: Tool.java 项目: laiello/perseph
  protected void generateNFAs(Grammar g) {
    DOTGenerator dotGenerator = new DOTGenerator(g);
    Collection rules = g.getAllImportedRules();
    rules.addAll(g.getRules());

    for (Iterator itr = rules.iterator(); itr.hasNext(); ) {
      Rule r = (Rule) itr.next();
      try {
        String dot = dotGenerator.getDOT(r.startState);
        if (dot != null) {
          writeDOTFile(g, r, dot);
        }
      } catch (IOException ioe) {
        ErrorManager.error(ErrorManager.MSG_CANNOT_WRITE_FILE, ioe);
      }
    }
  }
  public int expand(Unit unit, Grammar grammar, Rule parent, List replacement) {
    Symbol whiteSymbol = grammar.declared(this);

    if (whiteSymbol == null) return -1;

    replacement.add(whiteSymbol);
    return replacement.size() - 1;
  }
示例#8
0
 List<ANTLRMessage> checkRuleDFA(String gtext, String ruleName, String expecting)
     throws Exception {
   ErrorQueue equeue = new ErrorQueue();
   Grammar g = new Grammar(gtext, equeue);
   ATN atn = createATN(g);
   ATNState s = atn.ruleToStartState[g.getRule(ruleName).index];
   if (s == null) {
     System.err.println("no such rule: " + ruleName);
     return null;
   }
   ATNState t = s.transition(0).target;
   if (!(t instanceof DecisionState)) {
     System.out.println(ruleName + " has no decision");
     return null;
   }
   DecisionState blk = (DecisionState) t;
   checkRuleDFA(g, blk, expecting);
   return equeue.all;
 }
示例#9
0
 List<Integer> getTypesFromString(Grammar g, String expecting) {
   List<Integer> expectingTokenTypes = new ArrayList<Integer>();
   if (expecting != null && !expecting.trim().equals("")) {
     for (String tname : expecting.replace(" ", "").split(",")) {
       int ttype = g.getTokenType(tname);
       expectingTokenTypes.add(ttype);
     }
   }
   return expectingTokenTypes;
 }
示例#10
0
文件: Tool.java 项目: laiello/perseph
 public void generateDFAs(Grammar g) {
   for (int d = 1; d <= g.getNumberOfDecisions(); d++) {
     DFA dfa = g.getLookaheadDFA(d);
     if (dfa == null) {
       continue; // not there for some reason, ignore
     }
     DOTGenerator dotGenerator = new DOTGenerator(g);
     String dot = dotGenerator.getDOT(dfa.startState);
     String dotFileName = g.name + "." + "dec-" + d;
     if (g.implicitLexer) {
       dotFileName = g.name + Grammar.grammarTypeToFileNameSuffix[g.type] + "." + "dec-" + d;
     }
     try {
       writeDOTFile(g, dotFileName, dot);
     } catch (IOException ioe) {
       ErrorManager.error(ErrorManager.MSG_CANNOT_GEN_DOT_FILE, dotFileName, ioe);
     }
   }
 }
示例#11
0
文件: Tool.java 项目: laiello/perseph
  /** Get a grammar mentioned on the command-line and any delegates */
  public Grammar getRootGrammar(String grammarFileName) throws IOException {
    // StringTemplate.setLintMode(true);
    // grammars mentioned on command line are either roots or single grammars.
    // create the necessary composite in case it's got delegates; even
    // single grammar needs it to get token types.
    CompositeGrammar composite = new CompositeGrammar();
    Grammar grammar = new Grammar(this, grammarFileName, composite);
    composite.setDelegationRoot(grammar);
    FileReader fr = null;
    File f = null;

    if (haveInputDir) {
      f = new File(inputDirectory, grammarFileName);
    } else {
      f = new File(grammarFileName);
    }

    // Store the location of this grammar as if we import files, we can then
    // search for imports in the same location as the original grammar as well as in
    // the lib directory.
    //
    parentGrammarDirectory = f.getParent();

    if (grammarFileName.lastIndexOf(File.separatorChar) == -1) {
      grammarOutputDirectory = ".";
    } else {
      grammarOutputDirectory =
          grammarFileName.substring(0, grammarFileName.lastIndexOf(File.separatorChar));
    }
    fr = new FileReader(f);
    BufferedReader br = new BufferedReader(fr);
    grammar.parseAndBuildAST(br);
    composite.watchNFAConversion = internalOption_watchNFAConversion;
    br.close();
    fr.close();
    return grammar;
  }
示例#12
0
  /**
   * @param previousGrammar
   * @param previousLexicon
   * @param grammar
   * @param lexicon
   * @param trainStateSetTrees
   * @return
   */
  public static double doOneEStep(
      Grammar previousGrammar,
      Lexicon previousLexicon,
      Grammar grammar,
      Lexicon lexicon,
      StateSetTreeList trainStateSetTrees,
      boolean updateOnlyLexicon,
      int unkThreshold) {
    boolean secondHalf = false;
    ArrayParser parser = new ArrayParser(previousGrammar, previousLexicon);
    double trainingLikelihood = 0;
    int n = 0;
    int nTrees = trainStateSetTrees.size();
    for (Tree<StateSet> stateSetTree : trainStateSetTrees) {
      secondHalf = (n++ > nTrees / 2.0);
      boolean noSmoothing = true, debugOutput = false;
      parser.doInsideOutsideScores(stateSetTree, noSmoothing, debugOutput); // E Step
      double ll = stateSetTree.getLabel().getIScore(0);
      ll =
          Math.log(ll)
              + (100 * stateSetTree.getLabel().getIScale()); // System.out.println(stateSetTree);
      if ((Double.isInfinite(ll) || Double.isNaN(ll))) {
        if (VERBOSE) {
          System.out.println("Training sentence " + n + " is given " + ll + " log likelihood!");
          System.out.println(
              "Root iScore "
                  + stateSetTree.getLabel().getIScore(0)
                  + " scale "
                  + stateSetTree.getLabel().getIScale());
        }
      } else {
        lexicon.trainTree(stateSetTree, -1, previousLexicon, secondHalf, noSmoothing, unkThreshold);
        if (!updateOnlyLexicon) grammar.tallyStateSetTree(stateSetTree, previousGrammar); // E Step
        trainingLikelihood += ll; // there are for some reason some sentences that are unparsable
      }
    }
    lexicon.tieRareWordStats(unkThreshold);

    // SSIE
    ((SophisticatedLexicon) lexicon).overwriteWithMaxent();

    return trainingLikelihood;
  }
示例#13
0
    public Tree<String> getBestParse(List<String> sentence) {
      // This implements the CKY algorithm
      int nEntries = sentence.size();

      // hashmap to store back rules
      HashMap<Triplet<Integer, Integer, String>, Triplet<Integer, String, String>> backHash =
          new HashMap<Triplet<Integer, Integer, String>, Triplet<Integer, String, String>>();

      // more efficient access with arrays, but must cast each time :(
      @SuppressWarnings("unchecked")
      Counter<String>[][] parseScores = (Counter<String>[][]) (new Counter[nEntries][nEntries]);

      for (int i = 0; i < nEntries; i++) {
        for (int j = 0; j < nEntries; j++) {
          parseScores[i][j] = new Counter<String>();
        }
      }

      System.out.println(sentence.toString());
      // First deal with the lexicons
      int index = 0;
      int span = 1; // All spans are 1 at the lexicon level
      for (String word : sentence) {
        for (String tag : lexicon.getAllTags()) {
          double score = lexicon.scoreTagging(word, tag);
          if (score >= 0.0) { // This lexicon may generate this word
            // We use a counter map in order to store the scores for this sentence parse.
            parseScores[index][index + span - 1].setCount(tag, score);
          }
        }
        index = index + 1;
      }

      // handle unary rules now

      // System.out.println("Lexicons found");
      boolean added = true;

      while (added) {
        added = false;
        for (index = 0; index < sentence.size(); index++) {
          // For each index+ span pair, get the counter.
          Counter<String> count = parseScores[index][index + span - 1];
          PriorityQueue<String> countAsPQ = count.asPriorityQueue();
          while (countAsPQ.hasNext()) {
            String entry = countAsPQ.next();
            // System.out.println("I am fine here!!");
            List<UnaryRule> unaryRules = grammar.getUnaryRulesByChild(entry);
            for (UnaryRule rule : unaryRules) {
              // These are the unary rules which might give rise to the above preterminal
              double prob = rule.getScore() * parseScores[index][index + span - 1].getCount(entry);
              if (prob > parseScores[index][index + span - 1].getCount(rule.parent)) {
                parseScores[index][index + span - 1].setCount(rule.parent, prob);
                backHash.put(
                    new Triplet<Integer, Integer, String>(index, index + span, rule.parent),
                    new Triplet<Integer, String, String>(-1, entry, null));
                added = true;
              }
            }
          }
        }
      }
      // System.out.println("Lexicon unaries dealt with");

      // Now work with the grammar to produce higher level probabilities
      for (span = 2; span <= sentence.size(); span++) {
        for (int begin = 0; begin <= (sentence.size() - span); begin++) {
          int end = begin + span;
          for (int split = begin + 1; split <= end - 1; split++) {
            Counter<String> countLeft = parseScores[begin][split - 1];
            Counter<String> countRight = parseScores[split][end - 1];
            // List<BinaryRule> leftRules= new ArrayList<BinaryRule>();
            HashMap<Integer, BinaryRule> leftMap = new HashMap<Integer, BinaryRule>();
            // List<BinaryRule> rightRules=new ArrayList<BinaryRule>();
            HashMap<Integer, BinaryRule> rightMap = new HashMap<Integer, BinaryRule>();

            for (String entry : countLeft.keySet()) {
              for (BinaryRule rule : grammar.getBinaryRulesByLeftChild(entry)) {
                if (!leftMap.containsKey(rule.hashCode())) {
                  leftMap.put(rule.hashCode(), rule);
                }
              }
            }

            for (String entry : countRight.keySet()) {
              for (BinaryRule rule : grammar.getBinaryRulesByRightChild(entry)) {
                if (!rightMap.containsKey(rule.hashCode())) {
                  rightMap.put(rule.hashCode(), rule);
                }
              }
            }

            // System.out.println("About to enter the rules loops");
            for (Integer ruleHash : leftMap.keySet()) {
              if (rightMap.containsKey(ruleHash)) {
                BinaryRule ruleRight = rightMap.get(ruleHash);
                double prob =
                    ruleRight.getScore()
                        * parseScores[begin][split - 1].getCount(ruleRight.leftChild)
                        * parseScores[split][end - 1].getCount(ruleRight.rightChild);
                // System.out.println(begin+" "+ end +" "+ ruleRight.parent+ " "+ prob);
                if (prob > parseScores[begin][end - 1].getCount(ruleRight.parent)) {
                  // System.out.println(begin+" "+ end +" "+ ruleRight.parent+ " "+ prob);
                  // System.out.println("parentrule :"+ ruleRight.getParent());
                  parseScores[begin][end - 1].setCount(ruleRight.getParent(), prob);
                  backHash.put(
                      new Triplet<Integer, Integer, String>(begin, end, ruleRight.parent),
                      new Triplet<Integer, String, String>(
                          split, ruleRight.leftChild, ruleRight.rightChild));
                }
              }
            }

            // System.out.println("Exited rules loop");

          }
          // System.out.println("Grammar found for " + begin + " "+ end);
          // Now handle unary rules
          added = true;
          while (added) {
            added = false;
            Counter<String> count = parseScores[begin][end - 1];
            PriorityQueue<String> countAsPriorityQueue = count.asPriorityQueue();
            while (countAsPriorityQueue.hasNext()) {
              String entry = countAsPriorityQueue.next();
              List<UnaryRule> unaryRules = grammar.getUnaryRulesByChild(entry);
              for (UnaryRule rule : unaryRules) {
                double prob = rule.getScore() * parseScores[begin][end - 1].getCount(entry);
                if (prob > parseScores[begin][end - 1].getCount(rule.parent)) {
                  parseScores[begin][end - 1].setCount(rule.parent, prob);

                  backHash.put(
                      new Triplet<Integer, Integer, String>(begin, end, rule.parent),
                      new Triplet<Integer, String, String>(-1, entry, null));
                  added = true;
                }
              }
            }
          }

          // System.out.println("Unaries dealt for " + begin + " "+ end);

        }
      }

      // Create and return the parse tree
      Tree<String> parseTree = new Tree<String>("null");
      // System.out.println(parseScores.getCounter(0+" "+sentence.size()).toString());

      // Pick the argmax
      String parent = parseScores[0][nEntries - 1].argMax();

      // Or pick root. This second one is preferred since sentences are meant to have ROOT as their
      // root node.
      parent = "ROOT";
      parseTree = getParseTree(sentence, backHash, 0, sentence.size(), parent);
      // System.out.println("PARSE SCORES");
      //	System.out.println(parseScores.toString());
      // System.out.println("BACK HASH");
      // System.out.println(backHash.toString());
      //	parseTree = addRoot(parseTree);
      // System.out.println(parseTree.toString());
      // return parseTree;
      return TreeAnnotations.unAnnotateTree(parseTree);
    }
  private Regexp internalGetBoundedRegexp(Grammar g, String startSymbol, int bound) {
    if (regexpCache.containsKeys(startSymbol, bound)) return regexpCache.get(startSymbol, bound);

    // removing epsilons may not succeed for start symbol. We check if that's the case.
    boolean canGenerateEmptyString = g.containsEpsilonProduction(startSymbol);

    List<Regexp> x = new ArrayList<Regexp>();
    for (GrammarProduction prod : g.getRule(startSymbol).getProductions()) {
      List<GrammarProductionElement> elems = prod.getElements();
      if (canGenerateEmptyString
          || elems.size()
              <= bound) { // uses the fact that every symbol (other than start) produces at least
                          // one terminal
        List<List<Integer>> distros = new ArrayList<List<Integer>>(distributions(bound, elems));

        distrosLoop:
        for (int j = 0; j < distros.size(); j++) {
          List<Integer> distro = distros.get(j);
          Regexp[] exps = new Regexp[distro.size()];
          for (int i = 0; i < elems.size(); i++) {
            GrammarProductionElement elem = elems.get(i);
            int sizeForElem = distro.get(i);
            if (terms_are_single_char) {
              if (sizeForElem > 1
                  && (elem.getKind() == GrammarElementKind.GTERMINAL
                      || elem.getKind() == GrammarElementKind.GSPECIAL)) {
                continue
                    distrosLoop; // no way you can generate a string longer than 1 from a terminal
              }
              if (sizeForElem == 1 && elem.getKind() == GrammarElementKind.GTERMINAL) {
                TerminalElement te = (TerminalElement) elem;
                exps[i] = HampiConstraints.constRegexp(te.getNameNoQuotes());
              } else if (sizeForElem == 1 && elem.getKind() == GrammarElementKind.GSPECIAL) {
                SpecialElement spec = (SpecialElement) elem;
                exps[i] = HampiConstraints.constRegexp(spec.getNameNoDelimiters());
              } else if (elem.getKind() == GrammarElementKind.GNONTERMINAL) {
                NonterminalElement nt = (NonterminalElement) elem;
                if (bounds.containsKey(nt)
                    && bounds.get(nt)
                        < sizeForElem) { // cannot generate a string longer than the upper bound on
                                         // all strings generatable from the nonterminal
                  continue distrosLoop;
                }
                Regexp subRegexp = internalGetBoundedRegexp(g, nt.getName(), sizeForElem);
                if (subRegexp != null) {
                  exps[i] = subRegexp;
                } else {
                  continue distrosLoop;
                }
              } else throw new IllegalStateException("expected a nonterminal or special" + elem);
            } else {
              if (elem.getKind() == GrammarElementKind.GSPECIAL)
                throw new UnsupportedOperationException("not implemented yet");
              if (elem.getKind() == GrammarElementKind.GTERMINAL) {
                TerminalElement term = (TerminalElement) elem;
                if (term.getNameNoQuotes().length() != sizeForElem) {
                  continue distrosLoop; // no way you can generate a string this long
                } else {
                  exps[i] = HampiConstraints.constRegexp(term.getNameNoQuotes());
                }
              } else if (elem.getKind() == GrammarElementKind.GNONTERMINAL) {
                NonterminalElement nt = (NonterminalElement) elem;
                if (bounds.containsKey(nt)
                    && bounds.get(nt)
                        < sizeForElem) { // cannot generate a string longer than the upper bound on
                                         // all strings generatable from the nonterminal
                  continue distrosLoop;
                }
                Regexp subRegexp = internalGetBoundedRegexp(g, nt.getName(), sizeForElem);
                if (subRegexp != null) {
                  exps[i] = subRegexp;
                } else {
                  continue distrosLoop;
                }
              } else throw new IllegalStateException("expected a nonterminal or special" + elem);
            }
          }
          Regexp e;
          if (exps.length == 1) {
            e = exps[0];
          } else {
            e = HampiConstraints.concatRegexp(exps);
          }
          if (!x.contains(e)) {
            x.add(e);
          }
        }
      }
    }

    Regexp result;
    if (x.isEmpty() && !canGenerateEmptyString) {
      result = null;
    } else if (x.isEmpty() && canGenerateEmptyString) {
      Hampi h = new Hampi();
      result = h.constRegexp("");
    } else if (x.size() == 1) {
      result = x.get(0);
    } else {
      Hampi h = new Hampi();
      result = h.orRegexp(x.toArray(new Regexp[x.size()]));
    }
    regexpCache.put(startSymbol, bound, result);
    return result;
  }
示例#15
0
  public static void main(String[] args) {
    OptionParser optParser = new OptionParser(Options.class);
    Options opts = (Options) optParser.parse(args, true);
    // provide feedback on command-line arguments
    System.out.println("Calling with " + optParser.getPassedInOptions());

    String path = opts.path;
    //    int lang = opts.lang;
    System.out.println("Loading trees from " + path + " and using language " + opts.treebank);

    double trainingFractionToKeep = opts.trainingFractionToKeep;

    int maxSentenceLength = opts.maxSentenceLength;
    System.out.println("Will remove sentences with more than " + maxSentenceLength + " words.");

    HORIZONTAL_MARKOVIZATION = opts.horizontalMarkovization;
    VERTICAL_MARKOVIZATION = opts.verticalMarkovization;
    System.out.println(
        "Using horizontal="
            + HORIZONTAL_MARKOVIZATION
            + " and vertical="
            + VERTICAL_MARKOVIZATION
            + " markovization.");

    Binarization binarization = opts.binarization;
    System.out.println(
        "Using " + binarization.name() + " binarization."); // and "+annotateString+".");

    double randomness = opts.randomization;
    System.out.println("Using a randomness value of " + randomness);

    String outFileName = opts.outFileName;
    if (outFileName == null) {
      System.out.println("Output File name is required.");
      System.exit(-1);
    } else System.out.println("Using grammar output file " + outFileName + ".");

    VERBOSE = opts.verbose;
    RANDOM = new Random(opts.randSeed);
    System.out.println("Random number generator seeded at " + opts.randSeed + ".");

    boolean manualAnnotation = false;
    boolean baseline = opts.baseline;
    boolean noSplit = opts.noSplit;
    int numSplitTimes = opts.numSplits;
    if (baseline) numSplitTimes = 0;
    String splitGrammarFile = opts.inFile;
    int allowedDroppingIters = opts.di;

    int maxIterations = opts.splitMaxIterations;
    int minIterations = opts.splitMinIterations;
    if (minIterations > 0)
      System.out.println("I will do at least " + minIterations + " iterations.");

    double[] smoothParams = {opts.smoothingParameter1, opts.smoothingParameter2};
    System.out.println("Using smoothing parameters " + smoothParams[0] + " and " + smoothParams[1]);

    boolean allowMoreSubstatesThanCounts = false;
    boolean findClosedUnaryPaths = opts.findClosedUnaryPaths;

    Corpus corpus =
        new Corpus(
            path,
            opts.treebank,
            trainingFractionToKeep,
            false,
            opts.skipSection,
            opts.skipBilingual);
    List<Tree<String>> trainTrees =
        Corpus.binarizeAndFilterTrees(
            corpus.getTrainTrees(),
            VERTICAL_MARKOVIZATION,
            HORIZONTAL_MARKOVIZATION,
            maxSentenceLength,
            binarization,
            manualAnnotation,
            VERBOSE);
    List<Tree<String>> validationTrees =
        Corpus.binarizeAndFilterTrees(
            corpus.getValidationTrees(),
            VERTICAL_MARKOVIZATION,
            HORIZONTAL_MARKOVIZATION,
            maxSentenceLength,
            binarization,
            manualAnnotation,
            VERBOSE);
    Numberer tagNumberer = Numberer.getGlobalNumberer("tags");

    //		for (Tree<String> t : trainTrees){
    //				System.out.println(t);
    //			}

    if (opts.trainOnDevSet) {
      System.out.println("Adding devSet to training data.");
      trainTrees.addAll(validationTrees);
    }

    if (opts.lowercase) {
      System.out.println("Lowercasing the treebank.");
      Corpus.lowercaseWords(trainTrees);
      Corpus.lowercaseWords(validationTrees);
    }

    int nTrees = trainTrees.size();

    System.out.println("There are " + nTrees + " trees in the training set.");

    double filter = opts.filter;
    if (filter > 0)
      System.out.println(
          "Will remove rules with prob under "
              + filter
              + ".\nEven though only unlikely rules are pruned the training LL is not guaranteed to increase in every round anymore "
              + "(especially when we are close to converging)."
              + "\nFurthermore it increases the variance because 'good' rules can be pruned away in early stages.");

    short nSubstates = opts.nSubStates;
    short[] numSubStatesArray =
        initializeSubStateArray(trainTrees, validationTrees, tagNumberer, nSubstates);
    if (baseline) {
      short one = 1;
      Arrays.fill(numSubStatesArray, one);
      System.out.println("Training just the baseline grammar (1 substate for all states)");
      randomness = 0.0f;
    }

    if (VERBOSE) {
      for (int i = 0; i < numSubStatesArray.length; i++) {
        System.out.println("Tag " + (String) tagNumberer.object(i) + " " + i);
      }
    }

    System.out.println("There are " + numSubStatesArray.length + " observed categories.");

    // initialize lexicon and grammar
    Lexicon lexicon = null, maxLexicon = null, previousLexicon = null;
    Grammar grammar = null, maxGrammar = null, previousGrammar = null;
    double maxLikelihood = Double.NEGATIVE_INFINITY;

    //    String smootherStr = opts.smooth;
    //    Smoother lexiconSmoother = null;
    //    Smoother grammarSmoother = null;
    //    if (splitGrammarFile!=null){
    //    	lexiconSmoother = maxLexicon.smoother;
    //    	grammarSmoother = maxGrammar.smoother;
    //    	System.out.println("Using smoother from input grammar.");
    //    }
    //    else if (smootherStr.equals("NoSmoothing"))
    //    	lexiconSmoother = grammarSmoother = new NoSmoothing();
    //    else if (smootherStr.equals("SmoothAcrossParentBits")) {
    //    	lexiconSmoother = grammarSmoother  = new SmoothAcrossParentBits(grammarSmoothing,
    // maxGrammar.splitTrees);
    //    }
    //    else
    //    	throw new Error("I didn't understand the type of smoother '"+smootherStr+"'");
    //    System.out.println("Using smoother "+smootherStr);

    // EM: iterate until the validation likelihood drops for four consecutive
    // iterations
    int iter = 0;
    int droppingIter = 0;

    //  If we are splitting, we load the old grammar and start off by splitting.
    int startSplit = 0;
    if (splitGrammarFile != null) {
      System.out.println("Loading old grammar from " + splitGrammarFile);
      startSplit = 1; // we've already trained the grammar
      ParserData pData = ParserData.Load(splitGrammarFile);
      maxGrammar = pData.gr;
      maxLexicon = pData.lex;
      numSubStatesArray = maxGrammar.numSubStates;
      previousGrammar = grammar = maxGrammar;
      previousLexicon = lexicon = maxLexicon;
      Numberer.setNumberers(pData.getNumbs());
      tagNumberer = Numberer.getGlobalNumberer("tags");
      System.out.println("Loading old grammar complete.");
      if (noSplit) {
        System.out.println("Will NOT split the loaded grammar.");
        startSplit = 0;
      }
    }

    double mergingPercentage = opts.mergingPercentage;
    boolean separateMergingThreshold = opts.separateMergingThreshold;
    if (mergingPercentage > 0) {
      System.out.println(
          "Will merge " + (int) (mergingPercentage * 100) + "% of the splits in each round.");
      System.out.println(
          "The threshold for merging lexical and phrasal categories will be set separately: "
              + separateMergingThreshold);
    }

    StateSetTreeList trainStateSetTrees =
        new StateSetTreeList(trainTrees, numSubStatesArray, false, tagNumberer);
    StateSetTreeList validationStateSetTrees =
        new StateSetTreeList(validationTrees, numSubStatesArray, false, tagNumberer); // deletePC);

    // get rid of the old trees
    trainTrees = null;
    validationTrees = null;
    corpus = null;
    System.gc();

    if (opts.simpleLexicon) {
      System.out.println(
          "Replacing words which have been seen less than 5 times with their signature.");
      Corpus.replaceRareWords(
          trainStateSetTrees, new SimpleLexicon(numSubStatesArray, -1), opts.rare);
    }

    // If we're training without loading a split grammar, then we run once without splitting.
    if (splitGrammarFile == null) {
      grammar =
          new Grammar(numSubStatesArray, findClosedUnaryPaths, new NoSmoothing(), null, filter);
      Lexicon tmp_lexicon =
          (opts.simpleLexicon)
              ? new SimpleLexicon(
                  numSubStatesArray,
                  -1,
                  smoothParams,
                  new NoSmoothing(),
                  filter,
                  trainStateSetTrees)
              : new SophisticatedLexicon(
                  numSubStatesArray,
                  SophisticatedLexicon.DEFAULT_SMOOTHING_CUTOFF,
                  smoothParams,
                  new NoSmoothing(),
                  filter);
      int n = 0;
      boolean secondHalf = false;
      for (Tree<StateSet> stateSetTree : trainStateSetTrees) {
        secondHalf = (n++ > nTrees / 2.0);
        tmp_lexicon.trainTree(stateSetTree, randomness, null, secondHalf, false, opts.rare);
      }
      lexicon =
          (opts.simpleLexicon)
              ? new SimpleLexicon(
                  numSubStatesArray,
                  -1,
                  smoothParams,
                  new NoSmoothing(),
                  filter,
                  trainStateSetTrees)
              : new SophisticatedLexicon(
                  numSubStatesArray,
                  SophisticatedLexicon.DEFAULT_SMOOTHING_CUTOFF,
                  smoothParams,
                  new NoSmoothing(),
                  filter);
      for (Tree<StateSet> stateSetTree : trainStateSetTrees) {
        secondHalf = (n++ > nTrees / 2.0);
        lexicon.trainTree(stateSetTree, randomness, tmp_lexicon, secondHalf, false, opts.rare);
        grammar.tallyUninitializedStateSetTree(stateSetTree);
      }
      lexicon.tieRareWordStats(opts.rare);
      lexicon.optimize();

      // SSIE
      ((SophisticatedLexicon) lexicon).overwriteWithMaxent();

      grammar.optimize(randomness);
      // System.out.println(grammar);
      previousGrammar = maxGrammar = grammar; // needed for baseline - when there is no EM loop
      previousLexicon = maxLexicon = lexicon;
    }

    // the main loop: split and train the grammar
    for (int splitIndex = startSplit; splitIndex < numSplitTimes * 3; splitIndex++) {

      // now do either a merge or a split and the end a smooth
      // on odd iterations merge, on even iterations split
      String opString = "";
      if (splitIndex % 3 == 2) { // (splitIndex==numSplitTimes*2){
        if (opts.smooth.equals("NoSmoothing")) continue;
        System.out.println("Setting smoother for grammar and lexicon.");
        Smoother grSmoother = new SmoothAcrossParentBits(0.01, maxGrammar.splitTrees);
        Smoother lexSmoother = new SmoothAcrossParentBits(0.1, maxGrammar.splitTrees);
        //        Smoother grSmoother = new SmoothAcrossParentSubstate(0.01);
        //        Smoother lexSmoother = new SmoothAcrossParentSubstate(0.1);
        maxGrammar.setSmoother(grSmoother);
        maxLexicon.setSmoother(lexSmoother);
        minIterations = maxIterations = opts.smoothMaxIterations;
        opString = "smoothing";
      } else if (splitIndex % 3 == 0) {
        // the case where we split
        if (opts.noSplit) continue;
        System.out.println(
            "Before splitting, we have a total of " + maxGrammar.totalSubStates() + " substates.");
        CorpusStatistics corpusStatistics = new CorpusStatistics(tagNumberer, trainStateSetTrees);
        int[] counts = corpusStatistics.getSymbolCounts();

        maxGrammar = maxGrammar.splitAllStates(randomness, counts, allowMoreSubstatesThanCounts, 0);
        maxLexicon = maxLexicon.splitAllStates(counts, allowMoreSubstatesThanCounts, 0);
        Smoother grSmoother = new NoSmoothing();
        Smoother lexSmoother = new NoSmoothing();
        maxGrammar.setSmoother(grSmoother);
        maxLexicon.setSmoother(lexSmoother);
        System.out.println(
            "After splitting, we have a total of " + maxGrammar.totalSubStates() + " substates.");
        System.out.println(
            "Rule probabilities are NOT normalized in the split, therefore the training LL is not guaranteed to improve between iteration 0 and 1!");
        opString = "splitting";
        maxIterations = opts.splitMaxIterations;
        minIterations = opts.splitMinIterations;
      } else {
        if (mergingPercentage == 0) continue;
        // the case where we merge
        double[][] mergeWeights =
            GrammarMerger.computeMergeWeights(maxGrammar, maxLexicon, trainStateSetTrees);
        double[][][] deltas =
            GrammarMerger.computeDeltas(maxGrammar, maxLexicon, mergeWeights, trainStateSetTrees);
        boolean[][][] mergeThesePairs =
            GrammarMerger.determineMergePairs(
                deltas, separateMergingThreshold, mergingPercentage, maxGrammar);

        grammar = GrammarMerger.doTheMerges(maxGrammar, maxLexicon, mergeThesePairs, mergeWeights);
        short[] newNumSubStatesArray = grammar.numSubStates;
        trainStateSetTrees = new StateSetTreeList(trainStateSetTrees, newNumSubStatesArray, false);
        validationStateSetTrees =
            new StateSetTreeList(validationStateSetTrees, newNumSubStatesArray, false);

        // retrain lexicon to finish the lexicon merge (updates the unknown words model)...
        lexicon =
            (opts.simpleLexicon)
                ? new SimpleLexicon(
                    newNumSubStatesArray,
                    -1,
                    smoothParams,
                    maxLexicon.getSmoother(),
                    filter,
                    trainStateSetTrees)
                : new SophisticatedLexicon(
                    newNumSubStatesArray,
                    SophisticatedLexicon.DEFAULT_SMOOTHING_CUTOFF,
                    maxLexicon.getSmoothingParams(),
                    maxLexicon.getSmoother(),
                    maxLexicon.getPruningThreshold());
        boolean updateOnlyLexicon = true;
        double trainingLikelihood =
            GrammarTrainer.doOneEStep(
                grammar,
                maxLexicon,
                null,
                lexicon,
                trainStateSetTrees,
                updateOnlyLexicon,
                opts.rare);
        //    		System.out.println("The training LL is "+trainingLikelihood);
        lexicon
            .optimize(); // Grammar.RandomInitializationType.INITIALIZE_WITH_SMALL_RANDOMIZATION);
                         // // M Step

        GrammarMerger.printMergingStatistics(maxGrammar, grammar);
        opString = "merging";
        maxGrammar = grammar;
        maxLexicon = lexicon;
        maxIterations = opts.mergeMaxIterations;
        minIterations = opts.mergeMinIterations;
      }
      // update the substate dependent objects
      previousGrammar = grammar = maxGrammar;
      previousLexicon = lexicon = maxLexicon;
      droppingIter = 0;
      numSubStatesArray = grammar.numSubStates;
      trainStateSetTrees = new StateSetTreeList(trainStateSetTrees, numSubStatesArray, false);
      validationStateSetTrees =
          new StateSetTreeList(validationStateSetTrees, numSubStatesArray, false);
      maxLikelihood = calculateLogLikelihood(maxGrammar, maxLexicon, validationStateSetTrees);
      System.out.println(
          "After "
              + opString
              + " in the "
              + (splitIndex / 3 + 1)
              + "th round, we get a validation likelihood of "
              + maxLikelihood);
      iter = 0;

      // the inner loop: train the grammar via EM until validation likelihood reliably drops
      do {
        iter += 1;
        System.out.println("Beginning iteration " + (iter - 1) + ":");

        // 1) Compute the validation likelihood of the previous iteration
        System.out.print("Calculating validation likelihood...");
        double validationLikelihood =
            calculateLogLikelihood(
                previousGrammar,
                previousLexicon,
                validationStateSetTrees); // The validation LL of previousGrammar/previousLexicon
        System.out.println("done: " + validationLikelihood);

        // 2) Perform the E step while computing the training likelihood of the previous iteration
        System.out.print("Calculating training likelihood...");
        grammar =
            new Grammar(
                grammar.numSubStates,
                grammar.findClosedPaths,
                grammar.smoother,
                grammar,
                grammar.threshold);
        lexicon =
            (opts.simpleLexicon)
                ? new SimpleLexicon(
                    grammar.numSubStates,
                    -1,
                    smoothParams,
                    lexicon.getSmoother(),
                    filter,
                    trainStateSetTrees)
                : new SophisticatedLexicon(
                    grammar.numSubStates,
                    SophisticatedLexicon.DEFAULT_SMOOTHING_CUTOFF,
                    lexicon.getSmoothingParams(),
                    lexicon.getSmoother(),
                    lexicon.getPruningThreshold());
        boolean updateOnlyLexicon = false;
        double trainingLikelihood =
            doOneEStep(
                previousGrammar,
                previousLexicon,
                grammar,
                lexicon,
                trainStateSetTrees,
                updateOnlyLexicon,
                opts.rare); // The training LL of previousGrammar/previousLexicon
        System.out.println("done: " + trainingLikelihood);

        // 3) Perform the M-Step
        lexicon.optimize(); // M Step
        grammar.optimize(0); // M Step

        // 4) Check whether previousGrammar/previousLexicon was in fact better than the best
        if (iter < minIterations || validationLikelihood >= maxLikelihood) {
          maxLikelihood = validationLikelihood;
          maxGrammar = previousGrammar;
          maxLexicon = previousLexicon;
          droppingIter = 0;
        } else {
          droppingIter++;
        }

        // 5) advance the 'pointers'
        previousGrammar = grammar;
        previousLexicon = lexicon;
      } while ((droppingIter < allowedDroppingIters) && (!baseline) && (iter < maxIterations));

      // Dump a grammar file to disk from time to time
      ParserData pData =
          new ParserData(
              maxLexicon,
              maxGrammar,
              null,
              Numberer.getNumberers(),
              numSubStatesArray,
              VERTICAL_MARKOVIZATION,
              HORIZONTAL_MARKOVIZATION,
              binarization);
      String outTmpName = outFileName + "_" + (splitIndex / 3 + 1) + "_" + opString + ".gr";
      System.out.println("Saving grammar to " + outTmpName + ".");
      if (pData.Save(outTmpName)) System.out.println("Saving successful.");
      else System.out.println("Saving failed!");
      pData = null;
    }

    // The last grammar/lexicon has not yet been evaluated. Even though the validation likelihood
    // has been dropping in the past few iteration, there is still a chance that the last one was in
    // fact the best so just in case we evaluate it.
    System.out.print("Calculating last validation likelihood...");
    double validationLikelihood = calculateLogLikelihood(grammar, lexicon, validationStateSetTrees);
    System.out.println(
        "done.\n  Iteration "
            + iter
            + " (final) gives validation likelihood "
            + validationLikelihood);
    if (validationLikelihood > maxLikelihood) {
      maxLikelihood = validationLikelihood;
      maxGrammar = previousGrammar;
      maxLexicon = previousLexicon;
    }

    ParserData pData =
        new ParserData(
            maxLexicon,
            maxGrammar,
            null,
            Numberer.getNumberers(),
            numSubStatesArray,
            VERTICAL_MARKOVIZATION,
            HORIZONTAL_MARKOVIZATION,
            binarization);

    System.out.println("Saving grammar to " + outFileName + ".");
    System.out.println("It gives a validation data log likelihood of: " + maxLikelihood);
    if (pData.Save(outFileName)) System.out.println("Saving successful.");
    else System.out.println("Saving failed!");

    System.exit(0);
  }
    public Tree<String> getBestParse(List<String> sentence) {
      // TODO: implement this method
      int n = sentence.size();

      // System.out.println("getBestParse: n=" + n);

      List<List<Map<Object, Double>>> scores = new ArrayList<List<Map<Object, Double>>>(n + 1);
      for (int i = 0; i < n + 1; i++) {
        List<Map<Object, Double>> row = new ArrayList<Map<Object, Double>>(n + 1);
        for (int j = 0; j < n + 1; j++) {
          row.add(new HashMap<Object, Double>());
        }
        scores.add(row);
      }
      List<List<Map<Object, Triplet<Integer, Object, Object>>>> backs =
          new ArrayList<List<Map<Object, Triplet<Integer, Object, Object>>>>(n + 1);
      for (int i = 0; i < n + 1; i++) {
        List<Map<Object, Triplet<Integer, Object, Object>>> row =
            new ArrayList<Map<Object, Triplet<Integer, Object, Object>>>(n + 1);
        for (int j = 0; j < n + 1; j++) {
          row.add(new HashMap<Object, Triplet<Integer, Object, Object>>());
        }
        backs.add(row);
      }

      /*
      System.out.println("scores=" + scores.size() + "x" + scores.get(0).size());
      System.out.println("backs=" + backs.size() + "x" + backs.get(0).size());
      printChart(scores, backs, "scores");
      */
      // First the Lexicon

      for (int i = 0; i < n; i++) {
        String word = sentence.get(i);
        for (String tag : lexicon.getAllTags()) {
          UnaryRule A = new UnaryRule(tag, word);
          A.setScore(Math.log(lexicon.scoreTagging(word, tag)));
          scores.get(i).get(i + 1).put(A, A.getScore());
          backs.get(i).get(i + 1).put(A, null);
        }

        // System.out.println("Starting unaries: i=" + i + ",n=" + n );
        // Handle unaries
        boolean added = true;
        while (added) {
          added = false;
          Map<Object, Double> A_scores = scores.get(i).get(i + 1);
          // Don't modify the dict we are iterating
          List<Object> A_keys = copyKeys(A_scores);
          // for (int j = 0; j < 5 && j < A_keys.size(); j++) {
          //	System.out.print("," + j + "=" + A_scores.get(A_keys.get(j)));
          // }

          for (Object oB : A_keys) {
            UnaryRule B = (UnaryRule) oB;
            for (UnaryRule A : grammar.getUnaryRulesByChild(B.getParent())) {
              double prob = Math.log(A.getScore()) + A_scores.get(B);
              if (prob > -1000.0) {

                if (!A_scores.containsKey(A) || prob > A_scores.get(A)) {
                  // System.out.print(" *A=" + A + ", B=" + B);
                  // System.out.print(",  prob=" +  prob);
                  // System.out.println(",  A_scores.get(A)=" +  A_scores.get(A));
                  A_scores.put(A, prob);
                  backs.get(i).get(i + 1).put(A, new Triplet<Integer, Object, Object>(-1, B, null));
                  added = true;
                }
                // System.out.println(", added=" + added);
              }
            }
          }
          // System.out.println(", A_scores=" + A_scores.size() + ", added=" + added);
        }
      }

      // printChart(scores, backs, "scores with Lexicon");

      // Do higher layers
      // Naming is based on rules: A -> B,C

      long startTime = new Date().getTime();
      for (int span = 2; span < n + 1; span++) {

        for (int begin = 0; begin < n + 1 - span; begin++) {
          int end = begin + span;

          Map<Object, Double> A_scores = scores.get(begin).get(end);
          Map<Object, Triplet<Integer, Object, Object>> A_backs = backs.get(begin).get(end);

          for (int split = begin + 1; split < end; split++) {

            Map<Object, Double> B_scores = scores.get(begin).get(split);
            Map<Object, Double> C_scores = scores.get(split).get(end);

            List<Object> B_list = new ArrayList<Object>(B_scores.keySet());
            List<Object> C_list = new ArrayList<Object>(C_scores.keySet());

            // This is a key optimization. !@#$
            // It avoids a B_list.size() x C_list.size() search in the for (Object B : B_list) loop
            Map<String, List<Object>> C_map = new HashMap<String, List<Object>>();
            for (Object C : C_list) {
              String parent = getParent(C);
              if (!C_map.containsKey(parent)) {
                C_map.put(parent, new ArrayList<Object>());
              }
              C_map.get(parent).add(C);
            }

            for (Object B : B_list) {
              for (BinaryRule A : grammar.getBinaryRulesByLeftChild(getParent(B))) {
                if (C_map.containsKey(A.getRightChild())) {
                  for (Object C : C_map.get(A.getRightChild())) {
                    // We now have A which has B as left child and C as right child
                    double prob = Math.log(A.getScore()) + B_scores.get(B) + C_scores.get(C);
                    if (!A_scores.containsKey(A) || prob > A_scores.get(A)) {
                      A_scores.put(A, prob);
                      A_backs.put(A, new Triplet<Integer, Object, Object>(split, B, C));
                    }
                  }
                }
              }
            }
          }

          // Handle unaries: A -> B
          boolean added = true;
          while (added) {
            added = false;
            // Don't modify the dict we are iterating
            List<Object> A_keys = copyKeys(A_scores);
            for (Object oB : A_keys) {
              for (UnaryRule A : grammar.getUnaryRulesByChild(getParent(oB))) {
                double prob = Math.log(A.getScore()) + A_scores.get(oB);
                if (!A_scores.containsKey(A) || prob > A_scores.get(A)) {
                  A_scores.put(A, prob);
                  A_backs.put(A, new Triplet<Integer, Object, Object>(-1, oB, null));
                  added = true;
                }
              }
            }
          }
        }
      }

      // printChart(scores, backs, "scores with Lexicon and Grammar");

      Map<Object, Double> topOfChart = scores.get(0).get(n);

      System.out.println("topOfChart: " + topOfChart.size());
      /*
      for (Object o: topOfChart.keySet()) {
          System.out.println("o=" + o + ", score=" + topOfChart.getCount(o));
      }
      */

      // All parses have "ROOT" at top of tree
      Object bestKey = null;
      Object secondBestKey = null;
      double bestScore = Double.NEGATIVE_INFINITY;
      double secondBestScore = Double.NEGATIVE_INFINITY;
      for (Object key : topOfChart.keySet()) {
        double score = topOfChart.get(key);
        if (score >= secondBestScore || secondBestKey == null) {
          secondBestKey = key;
          secondBestScore = score;
        }
        if ("ROOT".equals(getParent(key)) && (score >= bestScore || bestKey == null)) {
          bestKey = key;
          bestScore = score;
        }
      }

      if (bestKey == null) {
        bestKey = secondBestKey;
        System.out.println("secondBestKey=" + secondBestKey);
      }
      if (bestKey == null) {
        for (Object key : topOfChart.keySet()) {
          System.out.println("val=" + topOfChart.get(key) + ", key=" + key);
        }
      }
      System.out.println("bestKey=" + bestKey + ", log(prob)=" + topOfChart.get(bestKey));

      Tree<String> result = makeTree(backs, 0, n, bestKey);
      if (!"ROOT".equals(result.getLabel())) {
        List<Tree<String>> children = new ArrayList<Tree<String>>();
        children.add(result);
        result = new Tree<String>("ROOT", children); // !@#$
      }

      /*
      System.out.println("==================================================");
      System.out.println(result);
      System.out.println("====================^^^^^^========================");
      */
      return TreeAnnotations.unAnnotateTree(result);
    }
示例#17
0
    public Tree<String> getBestParseOld(List<String> sentence) {
      // TODO: This implements the CKY algorithm

      CounterMap<String, String> parseScores = new CounterMap<String, String>();

      System.out.println(sentence.toString());
      // First deal with the lexicons
      int index = 0;
      int span = 1; // All spans are 1 at the lexicon level
      for (String word : sentence) {
        for (String tag : lexicon.getAllTags()) {
          double score = lexicon.scoreTagging(word, tag);
          if (score >= 0.0) { // This lexicon may generate this word
            // We use a counter map in order to store the scores for this sentence parse.
            parseScores.setCount(index + " " + (index + span), tag, score);
          }
        }
        index = index + 1;
      }

      // handle unary rules now
      HashMap<String, Triplet<Integer, String, String>> backHash =
          new HashMap<
              String, Triplet<Integer, String, String>>(); // hashmap to store back propation

      // System.out.println("Lexicons found");
      Boolean added = true;

      while (added) {
        added = false;
        for (index = 0; index < sentence.size(); index++) {
          // For each index+ span pair, get the counter.
          Counter<String> count = parseScores.getCounter(index + " " + (index + span));
          PriorityQueue<String> countAsPQ = count.asPriorityQueue();
          while (countAsPQ.hasNext()) {
            String entry = countAsPQ.next();
            // System.out.println("I am fine here!!");
            List<UnaryRule> unaryRules = grammar.getUnaryRulesByChild(entry);
            for (UnaryRule rule : unaryRules) {
              // These are the unary rules which might give rise to the above preterminal
              double prob =
                  rule.getScore() * parseScores.getCount(index + " " + (index + span), entry);
              if (prob > parseScores.getCount(index + " " + (index + span), rule.parent)) {
                parseScores.setCount(index + " " + (index + span), rule.parent, prob);
                backHash.put(
                    index + " " + (index + span) + " " + rule.parent,
                    new Triplet<Integer, String, String>(-1, entry, null));
                added = true;
              }
            }
          }
        }
      }
      // System.out.println("Lexicon unaries dealt with");

      // Now work with the grammar to produce higher level probabilities
      for (span = 2; span <= sentence.size(); span++) {
        for (int begin = 0; begin <= (sentence.size() - span); begin++) {
          int end = begin + span;
          for (int split = begin + 1; split <= end - 1; split++) {
            Counter<String> countLeft = parseScores.getCounter(begin + " " + split);
            Counter<String> countRight = parseScores.getCounter(split + " " + end);
            // List<BinaryRule> leftRules= new ArrayList<BinaryRule>();
            HashMap<Integer, BinaryRule> leftMap = new HashMap<Integer, BinaryRule>();
            // List<BinaryRule> rightRules=new ArrayList<BinaryRule>();
            HashMap<Integer, BinaryRule> rightMap = new HashMap<Integer, BinaryRule>();

            for (String entry : countLeft.keySet()) {
              for (BinaryRule rule : grammar.getBinaryRulesByLeftChild(entry)) {
                if (!leftMap.containsKey(rule.hashCode())) {
                  leftMap.put(rule.hashCode(), rule);
                }
              }
            }

            for (String entry : countRight.keySet()) {
              for (BinaryRule rule : grammar.getBinaryRulesByRightChild(entry)) {
                if (!rightMap.containsKey(rule.hashCode())) {
                  rightMap.put(rule.hashCode(), rule);
                }
              }
            }

            // System.out.println("About to enter the rules loops");
            for (Integer ruleHash : leftMap.keySet()) {
              if (rightMap.containsKey(ruleHash)) {
                BinaryRule ruleRight = rightMap.get(ruleHash);
                double prob =
                    ruleRight.getScore()
                        * parseScores.getCount(begin + " " + split, ruleRight.leftChild)
                        * parseScores.getCount(split + " " + end, ruleRight.rightChild);
                // System.out.println(begin+" "+ end +" "+ ruleRight.parent+ " "+ prob);
                if (prob > parseScores.getCount(begin + " " + end, ruleRight.parent)) {
                  // System.out.println(begin+" "+ end +" "+ ruleRight.parent+ " "+ prob);
                  // System.out.println("parentrule :"+ ruleRight.getParent());
                  parseScores.setCount(begin + " " + end, ruleRight.getParent(), prob);
                  backHash.put(
                      begin + " " + end + " " + ruleRight.parent,
                      new Triplet<Integer, String, String>(
                          split, ruleRight.leftChild, ruleRight.rightChild));
                }
              }
            }

            // System.out.println("Exited rules loop");

          }
          // System.out.println("Grammar found for " + begin + " "+ end);
          // Now handle unary rules
          added = true;
          while (added) {
            added = false;
            Counter<String> count = parseScores.getCounter(begin + " " + end);
            PriorityQueue<String> countAsPriorityQueue = count.asPriorityQueue();
            while (countAsPriorityQueue.hasNext()) {
              String entry = countAsPriorityQueue.next();
              List<UnaryRule> unaryRules = grammar.getUnaryRulesByChild(entry);
              for (UnaryRule rule : unaryRules) {
                double prob = rule.getScore() * parseScores.getCount(begin + " " + (end), entry);
                if (prob > parseScores.getCount(begin + " " + (end), rule.parent)) {
                  parseScores.setCount(begin + " " + (end), rule.parent, prob);

                  backHash.put(
                      begin + " " + (end) + " " + rule.parent,
                      new Triplet<Integer, String, String>(-1, entry, null));
                  added = true;
                }
              }
            }
          }

          // System.out.println("Unaries dealt for " + begin + " "+ end);

        }
      }

      // Create and return the parse tree
      Tree<String> parseTree = new Tree<String>("null");
      // System.out.println(parseScores.getCounter(0+" "+sentence.size()).toString());
      String parent = parseScores.getCounter(0 + " " + sentence.size()).argMax();
      if (parent == null) {
        System.out.println(parseScores.getCounter(0 + " " + sentence.size()).toString());
        System.out.println("THIS IS WEIRD");
      }
      parent = "ROOT";
      parseTree = getParseTreeOld(sentence, backHash, 0, sentence.size(), parent);
      // System.out.println("PARSE SCORES");
      //	System.out.println(parseScores.toString());
      // System.out.println("BACK HASH");
      // System.out.println(backHash.toString());
      //	parseTree = addRoot(parseTree);
      // System.out.println(parseTree.toString());
      // return parseTree;
      return TreeAnnotations.unAnnotateTree(parseTree);
    }
示例#18
0
文件: Tool.java 项目: laiello/perseph
  public void process() {
    boolean exceptionWhenWritingLexerFile = false;
    String lexerGrammarFileName = null; // necessary at this scope to have access in the catch below

    // Have to be tricky here when Maven or build tools call in and must new Tool()
    // before setting options. The banner won't display that way!
    if (isVerbose() && showBanner) {
      ErrorManager.info("ANTLR Parser Generator  Version " + VERSION);
      showBanner = false;
    }

    try {
      sortGrammarFiles(); // update grammarFileNames
    } catch (Exception e) {
      ErrorManager.error(ErrorManager.MSG_INTERNAL_ERROR, e);
    } catch (Error e) {
      ErrorManager.error(ErrorManager.MSG_INTERNAL_ERROR, e);
    }

    for (String grammarFileName : grammarFileNames) {
      // If we are in make mode (to support build tools like Maven) and the
      // file is already up to date, then we do not build it (and in verbose mode
      // we will say so).
      if (make) {
        try {
          if (!buildRequired(grammarFileName)) continue;
        } catch (Exception e) {
          ErrorManager.error(ErrorManager.MSG_INTERNAL_ERROR, e);
        }
      }

      if (isVerbose() && !isDepend()) {
        System.out.println(grammarFileName);
      }
      try {
        if (isDepend()) {
          BuildDependencyGenerator dep = new BuildDependencyGenerator(this, grammarFileName);
          /*
          List outputFiles = dep.getGeneratedFileList();
          List dependents = dep.getDependenciesFileList();
          System.out.println("output: "+outputFiles);
          System.out.println("dependents: "+dependents);
           */
          System.out.println(dep.getDependencies());
          continue;
        }

        Grammar grammar = getRootGrammar(grammarFileName);
        // we now have all grammars read in as ASTs
        // (i.e., root and all delegates)
        grammar.composite.assignTokenTypes();
        grammar.composite.defineGrammarSymbols();
        grammar.composite.createNFAs();

        generateRecognizer(grammar);

        if (isPrintGrammar()) {
          grammar.printGrammar(System.out);
        }

        if (isReport()) {
          GrammarReport greport = new GrammarReport(grammar);
          System.out.println(greport.toString());
          // print out a backtracking report too (that is not encoded into log)
          System.out.println(greport.getBacktrackingReport());
          // same for aborted NFA->DFA conversions
          System.out.println(greport.getAnalysisTimeoutReport());
        }
        if (isProfile()) {
          GrammarReport greport = new GrammarReport(grammar);
          Stats.writeReport(GrammarReport.GRAMMAR_STATS_FILENAME, greport.toNotifyString());
        }

        // now handle the lexer if one was created for a merged spec
        String lexerGrammarStr = grammar.getLexerGrammar();
        // System.out.println("lexer grammar:\n"+lexerGrammarStr);
        if (grammar.type == Grammar.COMBINED && lexerGrammarStr != null) {
          lexerGrammarFileName = grammar.getImplicitlyGeneratedLexerFileName();
          try {
            Writer w = getOutputFile(grammar, lexerGrammarFileName);
            w.write(lexerGrammarStr);
            w.close();
          } catch (IOException e) {
            // emit different error message when creating the implicit lexer fails
            // due to write permission error
            exceptionWhenWritingLexerFile = true;
            throw e;
          }
          try {
            StringReader sr = new StringReader(lexerGrammarStr);
            Grammar lexerGrammar = new Grammar();
            lexerGrammar.composite.watchNFAConversion = internalOption_watchNFAConversion;
            lexerGrammar.implicitLexer = true;
            lexerGrammar.setTool(this);
            File lexerGrammarFullFile =
                new File(getFileDirectory(lexerGrammarFileName), lexerGrammarFileName);
            lexerGrammar.setFileName(lexerGrammarFullFile.toString());

            lexerGrammar.importTokenVocabulary(grammar);
            lexerGrammar.parseAndBuildAST(sr);

            sr.close();

            lexerGrammar.composite.assignTokenTypes();
            lexerGrammar.composite.defineGrammarSymbols();
            lexerGrammar.composite.createNFAs();

            generateRecognizer(lexerGrammar);
          } finally {
            // make sure we clean up
            if (deleteTempLexer) {
              File outputDir = getOutputDirectory(lexerGrammarFileName);
              File outputFile = new File(outputDir, lexerGrammarFileName);
              outputFile.delete();
            }
          }
        }
      } catch (IOException e) {
        if (exceptionWhenWritingLexerFile) {
          ErrorManager.error(ErrorManager.MSG_CANNOT_WRITE_FILE, lexerGrammarFileName, e);
        } else {
          ErrorManager.error(ErrorManager.MSG_CANNOT_OPEN_FILE, grammarFileName);
        }
      } catch (Exception e) {
        ErrorManager.error(ErrorManager.MSG_INTERNAL_ERROR, grammarFileName, e);
      }
      /*
      finally {
      System.out.println("creates="+ Interval.creates);
      System.out.println("hits="+ Interval.hits);
      System.out.println("misses="+ Interval.misses);
      System.out.println("outOfRange="+ Interval.outOfRange);
      }
       */
    }
  }