public void addNumbers(DataHolder dataholderHandler) { PropertyConfigurator.configure("conf/log4j.properties"); Logger myLogger = Logger.getLogger("learn.addNumbers"); myLogger.trace("Add numbers"); List<String> nums = new ArrayList<String>(); nums.addAll(Arrays.asList(this.myLearnerUtility.getConstant().NUMBER.split("\\|"))); // System.out.println(nums); // System.out.println(this.myLearnerUtility.getConstant().NUMBER); for (int i = 0; i < nums.size(); i++) { String word = nums.get(i); // String reg="\\b("+this.myLearnerUtility.getConstant().FORBIDDEN+")\\b"; // boolean f = word.matches(reg); if (word.matches("\\b(" + this.myLearnerUtility.getConstant().FORBIDDEN + ")\\b")) { continue; } dataholderHandler.updateDataHolder(word, "b", "*", "wordpos", 0); // this.getWordPOSHolder().put(new WordPOSKey(word, "b"), new // WordPOSValue("*", 0, 0, null, null)); // System.out.println("add Number: " + word); } dataholderHandler.updateDataHolder("NUM", "b", "*", "wordpos", 0); // this.getWordPOSHolder().put(new WordPOSKey("NUM", "b"), new // WordPOSValue("*",0, 0, null, null)); }
public void addProperNouns(DataHolder dataholderHandler) { PropertyConfigurator.configure("conf/log4j.properties"); Logger myLogger = Logger.getLogger("learn.addProperNouns"); myLogger.trace("Add proper nouns"); List<String> ppnouns = new ArrayList<String>(); ppnouns.addAll(Arrays.asList(Constant.PROPERNOUN.split("\\|"))); for (int i = 0; i < ppnouns.size(); i++) { String word = ppnouns.get(i); if (word.matches("\\b(" + this.myLearnerUtility.getConstant().FORBIDDEN + ")\\b")) { continue; } dataholderHandler.updateDataHolder(word, "b", "*", "wordpos", 0); // this.getWordPOSHolder().put(new WordPOSKey(word, "z"), new // WordPOSValue("*", 0, 0, null, null)); // System.out.println("Add ProperNoun: " + word); } }
public void addClusterStrings(DataHolder dataholderHandler) { PropertyConfigurator.configure("conf/log4j.properties"); Logger myLogger = Logger.getLogger("learn.addClusterstrings"); myLogger.trace("Add clusterstrings"); List<String> cltstrs = new ArrayList<String>(); cltstrs.addAll(Arrays.asList(this.myLearnerUtility.getConstant().CLUSTERSTRING.split("\\|"))); // System.out.println(cltstrs); // System.out.println(this.myLearnerUtility.getConstant().CLUSTERSTRING); for (int i = 0; i < cltstrs.size(); i++) { String word = cltstrs.get(i); if (word.matches("\\b(" + this.myLearnerUtility.getConstant().FORBIDDEN + ")\\b")) { continue; } dataholderHandler.updateDataHolder(word, "b", "*", "wordpos", 0); // this.getWordPOSHolder().put(new WordPOSKey(word, "b"), new // WordPOSValue("*", 1, 1, null, null)); // System.out.println("addClusterString: " + word); } }
public void addStopWords(DataHolder dataholderHandler) { PropertyConfigurator.configure("conf/log4j.properties"); Logger myLogger = Logger.getLogger("learn.addStopWords"); myLogger.trace("Add stop words"); List<String> stops = new ArrayList<String>(); stops.addAll(Arrays.asList(this.myLearnerUtility.getConstant().STOP.split("\\|"))); stops.addAll(Arrays.asList(new String[] {"NUM", "(", "[", "{", ")", "]", "}", "d+"})); myLogger.trace("Stop Words: " + stops); for (int i = 0; i < stops.size(); i++) { String word = stops.get(i); if (word.matches("\\b(" + this.myLearnerUtility.getConstant().FORBIDDEN + ")\\b")) { continue; } dataholderHandler.updateDataHolder(word, "b", "*", "wordpos", 0); myLogger.trace(String.format("(\"%s\", \"b\", \"*\", \"wordpos\", 0) added\n", word)); // this.getWordPOSHolder().put(new WordPOSKey(word, "b"), new // WordPOSValue("*", 0, 0, null, null)); // System.out.println("Add Stop Word: " + word+"\n"); } myLogger.trace("Quite\n"); }
/** * comma used for 'and': seen in TreatiseH, using comma for 'and' as in "adductor , diductor scars * clearly differentiated ;", which is the same as "adductor and diductor scars clearly * differentiated ;". ^m*n+,m*n+ or m*n+,m*n+;$, or m,mn. Clauses dealt in commaand do not contain * "and/or". andortag() deals with clauses that do. * * @param dataholderHandler */ public void commaAnd(DataHolder dataholderHandler) { // cover m,mn // last + =>* // "(?:<[A-Z]*[NO]+[A-Z]*>[^<]+?<\/[A-Z]*[NO]+[A-Z]*>\\s*)+" String nPhrasePattern = "(?:<[A-Z]*[NO]+[A-Z]*>[^<]+?<\\/[A-Z]*[NO]+[A-Z]*>\\s*)+"; // add last \\s* // "(?:<[A-Z]*M[A-Z]*>[^<]+?<\/[A-Z]*M[A-Z]*>\\s*)" String mPhrasePattern = "(?:<[A-Z]*M[A-Z]*>[^<]+?<\\/[A-Z]*M[A-Z]*>\\s*)"; // "(?:<[A-Z]*B[A-Z]*>[,:\.;<]<\/[A-Z]*B[A-Z]*>)" String bPattern = "(?:<[A-Z]*B[A-Z]*>[,:.;<]<\\/[A-Z]*B[A-Z]*>)"; String commaPattern = "<B>,</B>"; String phrasePattern = mPhrasePattern + "\\s*" + nPhrasePattern; String pattern = phrasePattern + "\\s+" + commaPattern + "\\s+(?:" + phrasePattern + "| |" + commaPattern + ")+"; String pattern1 = "^(" + pattern + ")"; String pattern2 = "(.*?)(" + pattern + ")\\s*" + bPattern + "\\$"; // changed last * to + String pattern3 = "^((?:" + mPhrasePattern + "\\s+)+" + commaPattern + "\\s+(?:" + mPhrasePattern + "|\\s*|" + commaPattern + ")+" + mPhrasePattern + "+\\s*" + nPhrasePattern + ")"; for (SentenceStructure sentenceItem : dataholderHandler.getSentenceHolder()) { int sentenceID = sentenceItem.getID(); String sentence = sentenceItem.getSentence(); String sentenceCopy = "" + sentence; sentenceCopy = sentenceCopy.replaceAll("></?", ""); Matcher m1 = StringUtility.createMatcher(sentenceCopy, pattern1); Matcher m2 = StringUtility.createMatcher(sentenceCopy, pattern2); Matcher m3 = StringUtility.createMatcher(sentenceCopy, pattern3); // case 1 if (m1.find()) { String tag = m1.group(1); tag = tag.replaceAll(",", "and"); tag = tag.replaceAll("</?\\S+?>", ""); tag = StringUtility.trimString(tag); // case 1.1 if (!StringUtility.isMatchedNullSafe(tag, " and$")) { dataholderHandler.tagSentenceWithMT(sentenceID, sentence, "", tag, "commaand[CA1]"); } } // case 2 else if (m2.find()) { String g1 = m2.group(1); String tag = m2.group(2); if (!StringUtility.isMatchedNullSafe( g1, "\\b(" + this.myLearnerUtility.getConstant().PREPOSITION + ")\\b") && !StringUtility.isMatchedNullSafe(g1, "<N>")) { tag = tag.replaceAll(",", "and"); tag = tag.replaceAll("</?\\S+?>", ""); tag = StringUtility.trimString(tag); // case 2.1.1 if (!StringUtility.isMatchedNullSafe(tag, " and$")) { dataholderHandler.tagSentenceWithMT(sentenceID, sentence, "", tag, "commaand[CA2]"); } } } // case 3 else if (m3.find()) { String tag = m3.group(1); String g1 = m3.group(1); // case 3.1 if (!StringUtility.isMatchedNullSafe( g1, "\\b(" + this.myLearnerUtility.getConstant().PREPOSITION + ")\\b")) { tag = tag.replaceAll(",", "and"); tag = tag.replaceAll("</?\\S+?>", ""); tag = StringUtility.trimString(tag); // case 3.1.1 if (!StringUtility.isMatchedNullSafe(tag, " and$")) { String[] tagWords = tag.split("\\s+"); List<String> tagWordsList = new ArrayList<String>(Arrays.asList(tagWords)); tag = tagWordsList.get(tagWordsList.size() - 1); String modifier = StringUtils.join(tagWordsList.subList(0, tagWordsList.size() - 1), " "); dataholderHandler.tagSentenceWithMT( sentenceID, sentence, modifier, tag, "commaand[CA3]"); } } } } }