private String getParentOrgan( Document document, Element description, Element parentStructure, String parentOfParentStructure) { // check for 'part_of' relation on parentstructure String partOfChain = getPartOfChain(document, description, parentStructure, 0) .replace(" of ", ",") .trim(); // part of organ of organ if (!partOfChain.isEmpty()) { // use explicit part_of parentOfParentStructure = partOfChain; } // add constraint organ to parentorgan list String parentStructureConstraint = parentStructure.getAttributeValue("constraint"); if (parentStructureConstraint != null) { if (characterKnowledgeBase.isEntityStructuralContraint(parentStructureConstraint)) { // parentorgan = constraint; //use the constraint of parentstruct as parentorgan, e.g. leaf // blade ..., petiole ..., vein .... parentOfParentStructure = parentStructureConstraint + "," + parentOfParentStructure; // blade, leaf } } // add name organ to parentorgan list // parentorgan = parentofparentstructure+" "+parentstruct.getAttributeValue("name");//leaf blade return parentStructure.getAttributeValue("name") + "," + parentOfParentStructure; // blade, leaf }
/**
 * Tells whether a structure element carries a constraint that itself names a structure.
 *
 * @param struct the structure element whose {@code constraint} attribute is inspected
 * @return {@code true} if the attribute is present, non-empty, and recognized by the character
 *     knowledge base as a structural entity constraint; {@code false} otherwise
 */
private boolean hasStructuralConstraint(Element struct) {
  String constraint = struct.getAttributeValue("constraint");
  return constraint != null
      && !constraint.isEmpty()
      && characterKnowledgeBase.isEntityStructuralContraint(constraint);
}
@Override public List<Token> tag(List<Token> sentence) { List<Token> posedSentence = new ArrayList<Token>(); for (int i = 0; i < sentence.size(); i++) { Token token = sentence.get(i); String word = token.getContent(); String p = ""; boolean isState = false; boolean isOrgan = false; boolean isVerb = false; // if(word.contains("~list~") || word.contains("_c_") || // organStateKnowledgeBase.isState(word)) // don't send numbers, values to learnedCharacterKnowledgeBase if (word.contains("~list~") || word.contains("_c_") || ((word.matches("[^a-z]+") || word.contains("=") || word.matches(".*?(^|[^a-z])(" + units + ")([^a-z]|$).*")) && word.matches(".*?\\d.*"))) { // units could be mixed in the numbers isState = true; } else if (learnedCharacterKnowledgeBase.isCategoricalState(word)) { isState = true; CharacterMatch m = learnedCharacterKnowledgeBase.getCharacterName(word); if (m != null && m.getCategories() != null && m.getCategories().matches("(^|.*?_)position_relational(_.*|$)")) { isVerb = true; } } else { // isOrgan = organStateKnowledgeBase.isOrgan(word); isOrgan = learnedCharacterKnowledgeBase.isEntity(word); } Map<String, Set<String>> wordsToRoles = terminologyLearner.getWordsToRoles(); if (word.length() > 0 && !word.matches("\\W") && !word.matches("(" + prepositions + ")") && !stopWords.contains(word)) { if (wordsToRoles.containsKey(word)) p = wordsToRoles.get(word).iterator().next(); } // Hong TODO modifiertokens Matcher mc = compreppattern.matcher(word); if (word.equals("lengths") || word.equals("length")) { posedSentence.add( new POSedToken(word, POS.NNS)); // stanford parser would sometimes take "lengths" as IN } else if (word.equals("moreorless")) { posedSentence.add(new POSedToken(word, POS.RB)); } else if (mc.matches()) { posedSentence.add(new POSedToken(word, POS.IN)); } else if (word.matches("in-.*?(-view|profile)")) { posedSentence.add(new POSedToken(word, POS.RB)); } else if (word.endsWith("ly") && word.indexOf("~") < 0) { posedSentence.add(new 
POSedToken(word, POS.RB)); } else if (word.compareTo("becoming") == 0 || word.compareTo("about") == 0) { posedSentence.add(new POSedToken(word, POS.RB)); } else if (word.compareTo("throughout") == 0 && (i + 1 == sentence.size() || sentence.get(i + 1).getContent().matches("(,|or)"))) { posedSentence.add(new POSedToken(word, POS.RB)); } else if (word.contains("#")) { posedSentence.add(new POSedToken(word.replace('#', '-'), POS.RB)); } else if (word.compareTo("plus") == 0 || word.compareTo("and-or") == 0) { posedSentence.add(new POSedToken(word, POS.CC)); } else if (word.matches("\\d+[cmd]?m\\d+[cmd]?m")) { posedSentence.add(new POSedToken(word, POS.CD)); } else if (word.matches("(" + units + ")")) { posedSentence.add(new POSedToken(word, POS.NNS)); } else if (word.matches("as-\\S+")) { posedSentence.add(new POSedToken(word, POS.IN)); } else if (p.contains("op")) { // <inner> larger. posedSentence.add(new POSedToken(word, POS.NNS)); } else if (p.contains("os") || (p.length() == 0 && isOrgan)) { posedSentence.add(new POSedToken(word, POS.NN)); } else if (word.matches("(\\{?\\b" + roman + "\\b\\}?)")) { // mohan // code // to // mark // roman // numbers // {ii} // or ii // as // ii/NNS word = word.replaceAll("\\{|\\}", ""); posedSentence.add(new POSedToken(word, POS.NNS)); } else if (word.matches("\\d*.{0,1}\\d+")) { posedSentence.add(new POSedToken(word, POS.CD)); } else if (isVerb) { posedSentence.add(new POSedToken(word, POS.VB)); } else if (p.contains("c") || isState) { int wordFrequency = corpus.getFrequency(word); if (wordFrequency > 79) { posedSentence.add(new Token(word)); } else { posedSentence.add(new POSedToken(token.getContent(), POS.JJ)); } } else { posedSentence.add(new Token(token.getContent())); } } return posedSentence; }