/**
 * Creates a term from the text accumulated in {@code sb} and splices it into the
 * term graph.
 *
 * <p>The new term starts at offset {@code offe}, takes its nature from
 * {@code tempNature} and its self score from {@code score}. It is linked between
 * {@code from} and {@code to}, inserted into {@code terms}, and finally passed to
 * {@link TermUtil#parseNature}.
 */
private void makeNewTerm() {
    String text = sb.toString();
    // The trailing literal 1 is passed through unchanged; its meaning is defined by Term's constructor.
    Term created = new Term(text, offe, tempNature.natureStr, 1);
    created.selfScore = score;
    created.setNature(tempNature);

    // Only words longer than three characters get their constituent sub-terms attached.
    if (text.length() > 3) {
        created.setSubTerm(TermUtil.getSubTerm(from, to));
    }

    // Wire the new term between its neighbours: from -> created -> to.
    TermUtil.termLink(from, created);
    TermUtil.termLink(created, to);

    TermUtil.insertTerm(terms, created);
    TermUtil.parseNature(created);
}
private List<Term> recogntion_() { Term term = null; Term tempTerm = null; List<Term> termList = new ArrayList<Term>(); int beginFreq = 10; for (int i = 0; i < terms.length; i++) { term = terms[i]; if (term == null || !term.getTermNatures().personAttr.flag) { continue; } term.score = 0; term.selfScore = 0; int freq = 0; for (int j = 2; j > -1; j--) { freq = term.getTermNatures().personAttr.getFreq(j, 0); if ((freq > 10) || (term.getName().length() == 2 && freq > 10)) { tempTerm = nameFind(i, beginFreq, j); if (tempTerm != null) { termList.add(tempTerm); // 如果是无争议性识别 if (skip) { for (int j2 = i; j2 < tempTerm.getToValue(); j2++) { if (terms[j2] != null) { terms[j2].score = 0; terms[j2].selfScore = 0; } } i = tempTerm.getToValue() - 1; break; } } } } beginFreq = term.getTermNatures().personAttr.begin + 1; } return termList; }
public void recognition() { if (branch == null) { return; } int length = terms.length - 1; Term term = null; for (int i = 0; i < length; i++) { if (terms[i] == null) { continue; } else { from = terms[i].getFrom(); terms[i].score = 0; terms[i].selfScore = 0; } branch = branch.getBranch(terms[i].getName()); if (branch == null || branch.getStatus() == 3) { reset(); continue; } offe = i; // 循环查找添加 term = terms[i]; sb.append(term.getName()); if (branch.getStatus() == 2) { term.selfScore = branch.getParam().getScore(); } boolean flag = true; while (flag) { term = term.getTo(); branch = branch.getBranch(term.getName()); // 如果没有找到跳出 if (branch == null) { break; } switch (branch.getStatus()) { case 1: sb.append(term.getName()); continue; case 2: sb.append(term.getName()); score = branch.getParam().getScore(); tempNature = branch.getParam().getNature(); to = term.getTo(); makeNewTerm(); continue; case 3: sb.append(term.getName()); score = branch.getParam().getScore(); tempNature = branch.getParam().getNature(); to = term.getTo(); makeNewTerm(); flag = false; break; default: System.out.println("怎么能出现0呢?"); break; } } reset(); } }
/** * 人名识别 * * @param term * @param offe * @param freq */ private Term nameFind(int offe, int beginFreq, int size) { // TODO Auto-generated method stub StringBuilder sb = new StringBuilder(); int undefinite = 0; skip = false; PersonNatureAttr pna = null; int index = 0; int freq = 0; double allFreq = 0; Term term = null; int i = offe; for (; i < terms.length; i++) { // 走到结尾处识别出来一个名字. if (terms[i] == null) { continue; } term = terms[i]; pna = term.getTermNatures().personAttr; // 在这个长度的这个位置的词频,如果没有可能就干掉,跳出循环 if ((freq = pna.getFreq(size, index)) == 0) { return null; } if (pna.allFreq > 0) { undefinite++; } sb.append(term.getName()); allFreq += Math.log(term.getTermNatures().allFreq + 1); allFreq += -Math.log((freq)); index++; if (index == size + 2) { break; } } double score = -Math.log(FACTORY[size]); score += allFreq; double endFreq = 0; // 开始寻找结尾词 boolean flag = true; while (flag) { i++; if (i >= terms.length) { endFreq = 10; flag = false; } else if (terms[i] != null) { int twoWordFreq = NgramLibrary.getTwoWordFreq(term, terms[i]); if (twoWordFreq > 3) { return null; } endFreq = terms[i].getTermNatures().personAttr.end + 1; flag = false; } } score -= Math.log(endFreq); score -= Math.log(beginFreq); if (score > -3) { return null; } if (allFreq > 0 && undefinite > 0) { return null; } skip = undefinite == 0; term = new Term(sb.toString(), offe, TermNatures.NR); term.selfScore = score; return term; }