List<Tree> prune(List<Tree> treeList, Label label, int start, int end) {
   // get reference tree
   if (treeList.size() == 1) {
     return treeList;
   }
   Tree testTree = treeList.get(0).treeFactory().newTreeNode(label, treeList);
   int goal = Numberer.getGlobalNumberer("states").number(label.value());
   Tree tempTree = parser.extractBestParse(goal, start, end);
   // parser.restoreUnaries(tempTree);
   Tree pcfgTree = debinarizer.transformTree(tempTree);
   Set<Constituent> pcfgConstituents =
       pcfgTree.constituents(new LabeledScoredConstituentFactory());
   // delete child labels that are not in reference but do not cross reference
   List<Tree> prunedChildren = new ArrayList<Tree>();
   int childStart = 0;
   for (int c = 0, numCh = testTree.numChildren(); c < numCh; c++) {
     Tree child = testTree.getChild(c);
     boolean isExtra = true;
     int childEnd = childStart + child.yield().size();
     Constituent childConstituent =
         new LabeledScoredConstituent(childStart, childEnd, child.label(), 0);
     if (pcfgConstituents.contains(childConstituent)) {
       isExtra = false;
     }
     if (childConstituent.crosses(pcfgConstituents)) {
       isExtra = false;
     }
     if (child.isLeaf() || child.isPreTerminal()) {
       isExtra = false;
     }
     if (pcfgTree.yield().size() != testTree.yield().size()) {
       isExtra = false;
     }
     if (!label.value().startsWith("NP^NP")) {
       isExtra = false;
     }
     if (isExtra) {
       System.err.println(
           "Pruning: "
               + child.label()
               + " from "
               + (childStart + start)
               + " to "
               + (childEnd + start));
       System.err.println("Was: " + testTree + " vs " + pcfgTree);
       prunedChildren.addAll(child.getChildrenAsList());
     } else {
       prunedChildren.add(child);
     }
     childStart = childEnd;
   }
   return prunedChildren;
 }
  public Object formResult() {
    Set brs = new HashSet();
    Set urs = new HashSet();
    // scan each rule / history pair
    int ruleCount = 0;
    for (Iterator pairI = rulePairs.keySet().iterator(); pairI.hasNext(); ) {
      if (ruleCount % 100 == 0) {
        System.err.println("Rules multiplied: " + ruleCount);
      }
      ruleCount++;
      Pair rulePair = (Pair) pairI.next();
      Rule baseRule = (Rule) rulePair.first;
      String baseLabel = (String) ruleToLabel.get(baseRule);
      List history = (List) rulePair.second;
      double totalProb = 0;
      for (int depth = 1; depth <= HISTORY_DEPTH() && depth <= history.size(); depth++) {
        List subHistory = history.subList(0, depth);
        double c_label = labelPairs.getCount(new Pair(baseLabel, subHistory));
        double c_rule = rulePairs.getCount(new Pair(baseRule, subHistory));
        // System.out.println("Multiplying out "+baseRule+" with history "+subHistory);
        // System.out.println("Count of "+baseLabel+" with "+subHistory+" is "+c_label);
        // System.out.println("Count of "+baseRule+" with "+subHistory+" is "+c_rule );

        double prob = (1.0 / HISTORY_DEPTH()) * (c_rule) / (c_label);
        totalProb += prob;
        for (int childDepth = 0; childDepth <= Math.min(HISTORY_DEPTH() - 1, depth); childDepth++) {
          Rule rule = specifyRule(baseRule, subHistory, childDepth);
          rule.score = (float) Math.log(totalProb);
          // System.out.println("Created  "+rule+" with score "+rule.score);
          if (rule instanceof UnaryRule) {
            urs.add(rule);
          } else {
            brs.add(rule);
          }
        }
      }
    }
    System.out.println("Total states: " + stateNumberer.total());
    BinaryGrammar bg = new BinaryGrammar(stateNumberer.total());
    UnaryGrammar ug = new UnaryGrammar(stateNumberer.total());
    for (Iterator brI = brs.iterator(); brI.hasNext(); ) {
      BinaryRule br = (BinaryRule) brI.next();
      bg.addRule(br);
    }
    for (Iterator urI = urs.iterator(); urI.hasNext(); ) {
      UnaryRule ur = (UnaryRule) urI.next();
      ug.addRule(ur);
    }
    return new Pair(ug, bg);
  }
Exemple #3
0
		public HashMap<String,ArrayList<TreeData>> parseAllDocs() throws IOException{ 
			String grammar =  "./jsan_resources/englishPCFG.ser.gz";
			String[] options = { "-maxLength", "120", "-retainTmpSubcategories" };
//			LexicalizedParser lp = new LexicalizedParser(grammar, options);
			
			LexicalizedParser lp = new LexicalizedParser()
			TreebankLanguagePack tlp = new PennTreebankLanguagePack();
			
			GrammaticalStructureFactory gsf = tlp.grammaticalStructureFactory();
			Iterable<List<? extends HasWord>> sentences;
			ArrayList<HashMap<String,ArrayList<String>>> everything = new ArrayList<HashMap<String,ArrayList<String>>>(3); 
			everything.add(0,otherSampleStrings);
			everything.add(1,authorSampleStrings);
			everything.add(2,toModifyStrings);
			Iterator<HashMap<String,ArrayList<String>>> everythingIter = everything.iterator();
			int docTypeNumber = -1; // 0 for otherSampleStrings, 1 for authorSampleStrings, 2 for toModifyStrings
			int numLoaded = 0;
			while(everythingIter.hasNext()){
				docTypeNumber++;
				HashMap<String,ArrayList<String>> currentSampleStrings = docPathFinder();
				Set<String> currentDocStrings = currentSampleStrings.keySet();
				Iterator<String> docStrIter = currentDocStrings.iterator();
				String docID;
				ArrayList<String> sentenceTokens;
				allTreeProcessors[docTypeNumber]  = new TreeProcessor();
				allTreeProcessors[docTypeNumber].clearLoadedTreeDataMaps();
				numLoaded=0;
				while(docStrIter.hasNext()){
					docID = docStrIter.next();
					sentenceTokens = currentSampleStrings.get(docID);
					if(sentenceTokens == null){
						allTreeProcessors[docTypeNumber].loadTreeDataMap(docID, GRAMMAR_DIR, false);
						numLoaded++;
						continue;
					}
					//System.out.println(sentenceTokens.size()+", strIter.hasNext? -> "+strIter.hasNext());

					numSentences = sentenceTokens.size();
					//initialize(numSentences);
					Iterator<String> sentIter = sentenceTokens.iterator();
					List<List<? extends HasWord>> tmp = new ArrayList<List<? extends HasWord>>();
					String tempSent;
					while(sentIter.hasNext()){
						tempSent = sentIter.next();
						Tokenizer<? extends HasWord> toke = tlp.getTokenizerFactory().getTokenizer(new StringReader(tempSent));
						List<? extends HasWord> sentenceTokenized = toke.tokenize();
						tmp.add(sentenceTokenized);
					}
					
					sentences = tmp;
					//int numDone = 0;
					TreeProcessor.singleDocMap.clear();
					boolean willSaveResults = true;
					for (List<? extends HasWord> sentence : sentences) {
						Tree parse = lp.apply(sentence);
						//parse.pennPrint();
						//System.out.println(parse.treeSkeletonCopy().toString());
						//System.out.println(parse.taggedYield());
						//System.out.println();
						//printSubTrees(parse);
						//TreeContainer.recurseTree(parse,"breadth");
						allTreeProcessors[docTypeNumber].processTree(parse, 0, willSaveResults); 
						//System.out.println(tc.processedTrees.toString().replaceAll("\\]\\], \\(","\\]\\]\n\\("));
						//numDone++;
						//System.out.println("sent "+numDone+" of "+numSentences+" done ");
						//System.out.println(tc.processedTrees.toString());
						//in.nextLine();
						//TreeContainer.recurseTree(parse, "depth");
						//in.nextLine();
						//addTree(parse);
						//GrammaticalStructure gs = gsf.newGrammaticalStructure(parse);//TODO: LOOK AT THIS
						//Collection tdl = gs.typedDependenciesCCprocessed(true);
						//System.out.println(tdl);
						//System.out.println();
					}
					if(willSaveResults == true)
						ObjectIO.writeObject(TreeProcessor.singleDocMap,docID, GRAMMAR_DIR);

					//System.out.println("After all sents: ");
					//System.out.println(tc.processedTrees.toString().replaceAll("\\]\\], \\(","\\]\\]\n\\("));
					//String sent3 = "This is one last test!";
					//Tree parse3 = lp.apply(sent3);
					//parse3.pennPrint();
					//System.out.println("After sorting and writing:");
					//System.out.println(tc.processedTrees.toString().replaceAll("\\]\\], \\(","\\]\\]\n\\("));
					//Scanner in = new Scanner(System.in);
					//System.out.println("First one done.");
					//in.nextLine();
					//viewTrees();
				}
				
				//TreeProcessor.writeTreeDataToCSV(sortedTD,docID);
				allTreeProcessors[docTypeNumber].unmergedMaps = new ArrayList<HashMap<String,TreeData>>(numLoaded+1);
				
			}	
			
			
			int i= 0;
			allParsedAndOrdered.clear();
			String[] docTypes = new String[]{"otherSample","authorSample","toModify"};
			for(i=0; i < 3; i++){
				allTreeProcessors[i].unmergedMaps.add(allTreeProcessors[i].processedTrees);
				allTreeProcessors[i].unmergedMaps.addAll(allTreeProcessors[i].loadedTreeDataMaps);
				allTreeProcessors[i].mergeTreeDataLists(allTreeProcessors[i].unmergedMaps);
				allParsedAndOrdered.put(docTypes[i],allTreeProcessors[i].sortTreeData(allTreeProcessors[i].mergedMap));
				
			}
			
			//ArrayList<TreeData> sortedTD = TreeContainer.sortTreeData(TreeContainer.allProcessedTrees);
			//TreeContainer.writeTreeDataToCSV(sortedTD,"ALL_AUTHORS");
			
			return allParsedAndOrdered;
		}
  /**
   * Tests that we can extract the basic grammatical relations correctly from some hard-coded trees.
   *
   * <p>Sentence examples from the manual to at least test each relation.
   */
  public void testBasicRelations() {
    Pair<String, String>[] examples =
        ErasureUtils.uncheckedCast(
            new Pair[] {
              // Gloss: Shanghai Pudong de orderly advance
              T(
                  "(NP (DNP (NP (NP (NR 浦东)) (NP (NN 开发))) (DEG 的)) (ADJP (JJ 有序)) (NP (NN 进行)))",
                  C(
                      "nn(开发-2, 浦东-1)",
                      "assmod(进行-5, 开发-2)",
                      "case(开发-2, 的-3)",
                      "amod(进行-5, 有序-4)",
                      "root(ROOT-0, 进行-5)")),

              // Gloss: Shanghai Pudong expand and legal-system synchronous
              T(
                  "(ROOT (IP (NP (NP (NR 上海) (NR 浦东)) (NP (NN 开发) (CC 与) (NN 法制) (NN 建设))) (VP (VV 同步))))",
                  C(
                      "nn(浦东-2, 上海-1)",
                      "nn(建设-6, 浦东-2)",
                      "conj(建设-6, 开发-3)",
                      "cc(建设-6, 与-4)",
                      "nn(建设-6, 法制-5)",
                      "nsubj(同步-7, 建设-6)",
                      "root(ROOT-0, 同步-7)")),

              // Gloss: this-year
              T("(LCP (NP (NT 近年)) (LC 来))", C("root(ROOT-0, 近年-1)", "case(近年-1, 来-2)")),

              // Gloss: according country and Shanghai de relevant law
              T(
                  "(PP (P 根据) (NP (DNP (NP (NP (NN 国家)) (CC 和) (NP (NR 上海市))) (DEG 的)) (ADJP (JJ 有关)) (NP (NN 规定))))",
                  C(
                      "case(规定-7, 根据-1)",
                      "conj(上海市-4, 国家-2)",
                      "cc(上海市-4, 和-3)",
                      "assmod(规定-7, 上海市-4)",
                      "case(上海市-4, 的-5)",
                      "amod(规定-7, 有关-6)",
                      "root(ROOT-0, 规定-7)")),

              // Gloss: building is expand Shanghai de primary economic activity
              T(
                  "(IP (NP (NN 建筑)) (VP (VC 是) (NP (CP (IP (VP (VV 开发) (NP (NR 浦东)))) (DEC 的)) (QP (CD 一) (CLP (M 项))) (ADJP (JJ 主要)) (NP (NN 经济) (NN 活动)))))",
                  C(
                      "nsubj(活动-10, 建筑-1)",
                      "cop(活动-10, 是-2)",
                      "relcl(活动-10, 开发-3)",
                      "dobj(开发-3, 浦东-4)",
                      "mark(开发-3, 的-5)",
                      "nummod(项-7, 一-6)",
                      "clf(活动-10, 项-7)",
                      "amod(活动-10, 主要-8)",
                      "nn(活动-10, 经济-9)",
                      "root(ROOT-0, 活动-10)")),

              // Gloss: nickel has-been named modern industry de vitamins
              T(
                  "(IP (NP (NN 镍)) (VP (SB 被) (VP (VV 称作) (NP (PU “) (DNP (NP (ADJP (JJ 现代)) (NP (NN 工业))) (DEG 的)) (NP (NN 维生素)) (PU ”)))))",
                  C(
                      "nsubjpass(称作-3, 镍-1)",
                      "auxpass(称作-3, 被-2)",
                      "root(ROOT-0, 称作-3)",
                      "punct(维生素-8, “-4)",
                      "amod(工业-6, 现代-5)",
                      "assmod(维生素-8, 工业-6)",
                      "case(工业-6, 的-7)",
                      "dobj(称作-3, 维生素-8)",
                      "punct(维生素-8, ”-9)")),

              // Gloss: once revealed then was included legal-system path
              T(
                  "(IP (VP (VP (ADVP (AD 一)) (VP (VV 出现))) (VP (ADVP (AD 就)) (VP (SB 被) (VP (VV 纳入) (NP (NN 法制) (NN 轨道)))))))))))",
                  C(
                      "advmod(出现-2, 一-1)",
                      "root(ROOT-0, 出现-2)",
                      "advmod(纳入-5, 就-3)",
                      "auxpass(纳入-5, 被-4)",
                      "dep(出现-2, 纳入-5)",
                      "nn(轨道-7, 法制-6)",
                      "dobj(纳入-5, 轨道-7)")),
              T(
                  "(IP (NP (NP (NR 格林柯尔)) (NP (NN 制冷剂)) (PRN (PU () (NP (NR 中国)) (PU ))) (ADJP (JJ 有限)) (NP (NN 公司))) (VP (VC 是) (NP (CP (CP (IP (NP (NP (NR 格林柯尔) (NN 集团) (NR 北美) (NN 公司)) (CC 与) (NP (NP (NR 中国) (NR 天津)) (NP (NN 开发区)) (ADJP (JJ 总)) (NP (NN 公司))) (CC 和) (NP (NP (NR 中国)) (NP (NR 南方)) (NP (NN 证券)) (ADJP (JJ 有限)) (NP (NN 公司)))) (VP (VV 合建))) (DEC 的))) (ADJP (JJ 合资)) (NP (NN 企业)))) (PU 。))",
                  C(
                      "nn(公司-7, 格林柯尔-1)",
                      "nn(公司-7, 制冷剂-2)",
                      "punct(中国-4, (-3)",
                      "prnmod(公司-7, 中国-4)",
                      "punct(中国-4, )-5)",
                      "amod(公司-7, 有限-6)",
                      "nsubj(企业-28, 公司-7)",
                      "cop(企业-28, 是-8)",
                      "nn(公司-12, 格林柯尔-9)",
                      "nn(公司-12, 集团-10)",
                      "nn(公司-12, 北美-11)",
                      "conj(公司-24, 公司-12)",
                      "cc(公司-24, 与-13)",
                      "nn(天津-15, 中国-14)",
                      "nn(公司-18, 天津-15)",
                      "nn(公司-18, 开发区-16)",
                      "amod(公司-18, 总-17)",
                      "conj(公司-24, 公司-18)",
                      "cc(公司-24, 和-19)",
                      "nn(公司-24, 中国-20)",
                      "nn(公司-24, 南方-21)",
                      "nn(公司-24, 证券-22)",
                      "amod(公司-24, 有限-23)",
                      "nsubj(合建-25, 公司-24)",
                      "relcl(企业-28, 合建-25)",
                      "mark(合建-25, 的-26)",
                      "amod(企业-28, 合资-27)",
                      "root(ROOT-0, 企业-28)",
                      "punct(企业-28, 。-29)")),
              T(
                  "(IP (NP (NR 汕头) (NN 机场)) (VP (VV 开通) (NP (NN 国际) (NN 国内) (NN 航线)) (QP (CD 四十四) (CLP (M 条)))) (PU 。))",
                  C(
                      "nn(机场-2, 汕头-1)",
                      "nsubj(开通-3, 机场-2)",
                      "root(ROOT-0, 开通-3)",
                      "nn(航线-6, 国际-4)",
                      "nn(航线-6, 国内-5)",
                      "dobj(开通-3, 航线-6)",
                      "nummod(条-8, 四十四-7)",
                      "range(开通-3, 条-8)",
                      "punct(开通-3, 。-9)")),
              T(
                  "(VP (NP (NT 以前)) (ADVP (AD 不)) (ADVP (AD 曾)) (VP (VV 遇到) (AS 过))))",
                  C(
                      "tmod(遇到-4, 以前-1)",
                      "neg(遇到-4, 不-2)",
                      "advmod(遇到-4, 曾-3)",
                      "root(ROOT-0, 遇到-4)",
                      "asp(遇到-4, 过-5)")),

              // TODO(pliang): add more test cases for all the relations not covered (see WARNING
              // below)
            });

    Set<String> ignoreRelations = new HashSet<>(Arrays.asList("subj", "obj", "mod"));
    // Make sure all the relations are tested for
    Set<String> testedRelations = new HashSet<String>();
    for (Pair<String, String> ex : examples) {
      for (String item : ex.second.split("\n"))
        testedRelations.add(item.substring(0, item.indexOf('(')));
    }
    for (String relation : UniversalChineseGrammaticalRelations.shortNameToGRel.keySet()) {
      if (!testedRelations.contains(relation))
        if (!ignoreRelations.contains(relation)) {
          System.err.println("WARNING: relation '" + relation + "' not tested");
        }
    }

    TreeReaderFactory trf = new PennTreeReaderFactory();
    for (Pair<String, String> ex : examples) {
      String testTree = ex.first;
      String testAnswer = ex.second;

      // specifying our own TreeReaderFactory is vital so that functional
      // categories - that is -TMP and -ADV in particular - are not stripped off
      Tree tree = Tree.valueOf(testTree, trf);
      GrammaticalStructure gs =
          new UniversalChineseGrammaticalStructure(tree, Filters.acceptFilter()); // include punct

      assertEquals(
          "Unexpected CC processed dependencies for tree " + testTree,
          testAnswer,
          UniversalChineseGrammaticalStructure.dependenciesToString(
              gs,
              gs.typedDependenciesCCprocessed(GrammaticalStructure.Extras.MAXIMAL),
              tree,
              false,
              false));
    }
  }