/** 人名消歧,比如.邓颖超生前->邓颖 超生 前 fix to 丁颖超 生 前! 规则的方式增加如果两个人名之间连接是- , ·,•则连接 */ public static void nameAmbiguity(Term[] terms) { Term from = null; Term term = null; Term next = null; for (int i = 0; i < terms.length - 1; i++) { term = terms[i]; if (term != null && term.termNatures() == TermNatures.NR && term.getName().length() == 2) { next = terms[i + 2]; if (next.termNatures().personAttr.split > 0) { term.setName(term.getName() + next.getName().charAt(0)); terms[i + 2] = null; terms[i + 3] = new Term(next.getName().substring(1), next.getOffe(), TermNatures.NW); TermUtil.termLink(term, terms[i + 3]); TermUtil.termLink(terms[i + 3], next.to()); } } } // 外国人名修正 for (int i = 0; i < terms.length; i++) { term = terms[i]; if (term != null && term.getName().length() == 1 && i > 0 && WordAlert.CharCover(term.getName().charAt(0)) == '·') { from = term.from(); next = term.to(); if (from.natrue().natureStr.startsWith("nr") && next.natrue().natureStr.startsWith("nr")) { from.setName(from.getName() + term.getName() + next.getName()); TermUtil.termLink(from, next.to()); terms[i] = null; terms[i + 1] = null; } } } }
/**
 * Computes the accumulated score of stepping from one term to the term that follows it.
 *
 * <p>The step cost is the negative natural logarithm of the sum of two parts: one built
 * from {@code from}'s total frequency (plus one) scaled by {@code dSmoothingPara}, and one
 * built from the two-word frequency reported by {@code NgramLibrary.getTwoWordFreq(from, to)}
 * scaled by {@code 1 - dSmoothingPara}. A negative cost is shifted up by the frequency.
 * The result is {@code from.score()} plus that cost.
 *
 * <p>If the frequency (plus one) is negative, no lookup is done: {@code from}'s score is
 * increased by {@code MAX_FREQUENCE}, stored back on {@code from}, and returned.
 *
 * @param from the preceding term; its score is overwritten in the negative-frequency case
 * @param to   the following term
 * @return the score of {@code from} plus the cost of the step to {@code to}
 */
public static double compuScore(Term from, Term to) {
    final double fromFreq = from.termNatures().allFreq + 1;

    // Negative frequency: penalise and remember the penalised score on the source term.
    if (fromFreq < 0) {
        final double penalised = from.score() + MAX_FREQUENCE;
        from.score(penalised);
        return penalised;
    }

    final int pairFreq = NgramLibrary.getTwoWordFreq(from, to);

    final double singleWordPart = dSmoothingPara * fromFreq / (MAX_FREQUENCE + 80000);
    final double pairPart = (1 - dSmoothingPara) * ((1 - dTemp) * pairFreq / fromFreq + dTemp);
    final double cost = -Math.log(singleWordPart + pairPart);

    // A negative cost is lifted by the frequency before it is added to the running score.
    final double adjustedCost = cost < 0 ? cost + fromFreq : cost;
    return from.score() + adjustedCost;
}
/** * 词性词频词长.计算出来一个分数 * * @param from * @param term * @return */ public static double compuScoreFreq(Term from, Term term) { // TODO Auto-generated method stub return from.termNatures().allFreq + term.termNatures().allFreq; }