/**
 * Segments the {@code line} field with {@code ToAnalysis} and prints the name of every
 * resulting term on its own line.
 */
public void test() {
    for (Term token : ToAnalysis.parse(line)) {
        System.out.println(token.getName());
    }
}
/* * 修正词性 */ public static List<Term> updateNature(List<Term> all, Forest... forests) { if (forests == null) { if (UserDefineLibrary.FOREST != null) { forests = new Forest[] {UserDefineLibrary.FOREST}; } else { return all; } } List<Term> result = new ArrayList<Term>(); for (Term term : all) { // 添加对正则停用词的支持 if ((stopwordPattern != null) && stopwordPattern.matcher(term.getName()).matches()) { continue; } for (Forest forest : forests) { String[] params = UserDefineLibrary.getParams(forest, term.getName()); if (params != null) { term.setNature(new Nature(params[0])); } } result.add(term); } return result; }
@Override public EmotionResponse emotionJudge(EmotionRequest req) throws TException { long s1 = System.currentTimeMillis(); String text = req.getText(); List<Term> terms = ToAnalysis.parse(text); List<String> words = new ArrayList<>(); for (Term term : terms) { words.add(term.getName()); } // logger.info("{} is cut into {}", text, words); double score = fbsc.classify(words); EmotionResponse response = new EmotionResponse(); response.setReq(req); response.setScore(score); int emotion = 0; if (score > 0.5) { emotion = 1; } else if (score < -0.5) { emotion = 2; } else { emotion = 3; } response.setEmotion(emotion); long s2 = System.currentTimeMillis(); logger.info("emotion: {}, used:{}ms", emotion, (s2 - s1)); return response; }
public List<String> cut(String text) { List<Term> terms = ToAnalysis.parse(text); List<String> array = new ArrayList<>(); StringBuilder sbDebug = new StringBuilder(); for (Term term : terms) { String word = term.getName().trim(); if (word.length() == 0) { continue; } // System.out.println(word); if (filterSingleWord) { // 过滤单字 if (word.length() < 2) { continue; } } if (filterStopWords) { if (stopWordsSet.contains(word)) { continue; } } Matcher matcher = pureNum.matcher(word); // 含数字的词都不要 if (matcher.find()) { continue; } array.add(word); if (debug) { sbDebug.append(word + ","); } } if (debug) { logger.info("cut {} into {}", text, sbDebug.toString()); } return array; }
@Test public void test() { String newWord = "爸爸去哪儿"; String nature = "aaaaa"; String str = "上海电力2012年财务报表如下怎爸爸去哪儿么办"; // 增加新词 UserDefineLibrary.insertWord(newWord, nature, 1000); List<Term> parse = ToAnalysis.parse(str); HashMap<String, Term> hs = new HashMap<String, Term>(); for (Term term : parse) { hs.put(term.getName(), term); } Assert.assertTrue(hs.containsKey(newWord)); Assert.assertEquals(hs.get(newWord).natrue().natureStr, nature); // 删除词 UserDefineLibrary.removeWord(newWord); parse = ToAnalysis.parse(str); hs = new HashMap<String, Term>(); for (Term term : parse) { hs.put(term.getName(), term); } Assert.assertTrue(!hs.containsKey(newWord)); }
public List<NewWord> getNewWords() { // TODO Auto-generated method stub List<NewWord> all = new ArrayList<NewWord>(); List<Term> termList = recogntion_(); for (Term term2 : termList) { all.add(new NewWord(term2.getName(), TermNatures.NR, term2.selfScore, 1)); } return all; }
@Override public List<ParseTerm> parse(String sentence) { // TODO Auto-generated method stub List<ParseTerm> result = new ArrayList<ParseTerm>(); List<Term> list = ToAnalysis.parse(sentence); for (Iterator<Term> it = list.iterator(); it.hasNext(); ) { Term t = it.next(); result.add(new ParseTerm(t.getRealName(), t.getNatureStr())); } return result; }
/**
 * Builds a term from the text accumulated in {@code sb}, starting at {@code offe} and tagged
 * with {@code tempNature}, then links it between {@code from} and {@code to}, inserts it into
 * {@code terms} and lets {@code TermUtil.parseNature} process it.
 *
 * <p>When the accumulated text is longer than three characters, the sub-terms between
 * {@code from} and {@code to} are attached to the new term.
 */
private void makeNewTerm() {
    final String text = sb.toString();
    final boolean attachSubTerms = sb.length() > 3;

    Term merged = new Term(text, offe, tempNature.natureStr, 1);
    merged.selfScore = score;
    merged.setNature(tempNature);
    if (attachSubTerms) {
        merged.setSubTerm(TermUtil.getSubTerm(from, to));
    }

    // Splice the new term into the chain before registering it.
    TermUtil.termLink(from, merged);
    TermUtil.termLink(merged, to);
    TermUtil.insertTerm(terms, merged);
    TermUtil.parseNature(merged);
}
/**
 * Segments {@code text} and joins the term names with single spaces.
 *
 * @param text text to segment
 * @return the space-separated term names with leading and trailing whitespace trimmed
 */
public String segAnsi(String text) {
    StringBuilder joined = new StringBuilder();
    for (Term term : ToAnalysis.parse(text)) {
        joined.append(term.getName()).append(" ");
    }
    return joined.toString().trim();
}
/**
 * Looks up the frequency recorded for the word pair {@code from} followed by {@code to}.
 *
 * @param from the preceding term; its id selects the row of {@code bigramTables}
 * @param to   the following term; its id is searched for within that row
 * @return the recorded frequency, or 0 when {@code from} has a negative id or the pair is not
 *     found
 */
public static int getTwoWordFreq(Term from, Term to) {
    final int fromId = from.getTermNatures().id;
    if (fromId < 0) {
        return 0;
    }
    BigramEntry[] row = bigramTables[fromId];
    int position = binarySearch(row, to.getTermNatures().id);
    return position < 0 ? 0 : row[position].freq;
}
/**
 * Splits the {@code line} field on tabs and, when it has exactly eight fields and the fourth
 * parses to an integer greater than 1, segments the eighth field and prints each term name.
 */
@Override
public void run() {
    String[] fields = line.split("\t");
    if (fields.length != 8) {
        return;
    }
    if (Integer.parseInt(fields[3]) <= 1) {
        return;
    }
    for (Term term : ToAnalysis.parse(fields[7])) {
        System.out.println(term.getName());
    }
}
/**
 * Merges runs of terms delimited by a configured opening/closing pair into a single term.
 *
 * <p>{@code ruleMap} maps the name of an opening term to the name of its closing term. Once
 * an opening term is seen, every following term up to and including the one whose name equals
 * the closing name is merged into the opening term, which is then tagged with {@code nature}.
 * Terms outside such a run are passed through unchanged.
 *
 * <p>If the input ends before the closing term is found, the terms collected so far are
 * appended unmerged. The previous code iterated over the output list while adding to it,
 * which either threw {@code ConcurrentModificationException} or dropped those terms.
 *
 * @param result segmentation result whose term list is replaced by the merged list
 */
public void recognition(Result result) {
    List<Term> terms = result.getTerms();
    String end = null;
    LinkedList<Term> mergeList = null;
    List<Term> list = new LinkedList<Term>();

    for (Term term : terms) {
        String name = term.getName();
        if (end == null) {
            end = ruleMap.get(name);
            if (end != null) {
                // Opening term found: start collecting the run.
                mergeList = new LinkedList<Term>();
                mergeList.add(term);
            } else {
                list.add(term);
            }
        } else {
            mergeList.add(term);
            if (end.equals(name)) {
                // Closing term found: fold the whole run into its first term.
                Term ft = mergeList.pollFirst();
                for (Term sub : mergeList) {
                    ft.merage(sub);
                }
                ft.setNature(nature);
                list.add(ft);
                mergeList = null;
                end = null;
            }
        }
    }

    if (mergeList != null) {
        // Unterminated run: keep its terms as they are instead of losing them.
        list.addAll(mergeList);
    }
    result.setTerms(list);
}
/**
 * Computes the score of moving from one term to the next.
 *
 * <p>The value is the negative logarithm of a smoothed combination of the frequency of
 * {@code from} and the frequency of the pair ({@code from}, {@code to}), added to the score
 * already held by {@code from}.
 *
 * @param from the preceding term
 * @param to   the following term
 * @return the accumulated score
 */
public static double compuScore(Term from, Term to) {
    final double fromFreq = from.getTermNatures().allFreq + 1;
    if (fromFreq < 0) {
        return from.getScore() + MAX_FREQUENCE;
    }

    final int pairFreq = TwoWordLibrary.getTwoWordFreq(from, to);
    double value =
            -Math.log(
                    dSmoothingPara * fromFreq / (MAX_FREQUENCE + 80000)
                            + (1 - dSmoothingPara) * ((1 - dTemp) * pairFreq / fromFreq + dTemp));

    // NOTE(review): the negative-value correction is applied up to two times, exactly as the
    // original code did with two consecutive checks; confirm whether the repeat is intended.
    for (int pass = 0; pass < 2 && value < 0; pass++) {
        value += fromFreq;
    }
    return from.getScore() + value;
}
/* * 停用词过滤并且修正词性 */ public static List<Term> modifResult(List<Term> all) { List<Term> result = new ArrayList<Term>(); try { for (Term term : all) { if (FILTER.size() > 0 && (FILTER.contains(term.getName()) || (isTag && FILTER.contains(TAG + term.natrue().natureStr)))) { continue; } // 添加对正则停用词的支持 if ((stopwordPattern != null) && stopwordPattern.matcher(term.getName()).matches()) { continue; } String[] params = UserDefineLibrary.getParams(term.getName()); if (params != null) { term.setNature(new Nature(params[0])); } result.add(term); } } catch (Exception e) { MyStaticValue.LIBRARYLOG.warn( "FilterStopWord.updateDic can not be null , " + "you must use set FilterStopWord.setUpdateDic(map) or use method set map"); } return result; }
/**
 * Manual smoke check: builds two terms, assigns each the word id that {@code InitDictionary}
 * reports for its name, and prints the pair frequency returned by {@code getTwoWordFreq}.
 */
public static void main(String[] args) {
    Term left = new Term("阿", 0, new TermNatures(TermNature.NULL));
    left.getTermNatures().id = InitDictionary.getWordId(left.getName());

    Term right = new Term("全国", 0, new TermNatures(TermNature.NULL));
    right.getTermNatures().id = InitDictionary.getWordId(right.getName());

    System.out.println(getTwoWordFreq(left, right));
}
/**
 * Filters stop words and corrects term natures.
 *
 * <p>A term is dropped when {@code FILTER} is non-empty and contains either its name or
 * {@code TAG} followed by its nature string. Surviving terms get the nature given by the
 * first parameter of their user-library entry, if any.
 *
 * <p>Any exception raised while processing is reported on standard error and the terms
 * collected up to that point are returned.
 *
 * @param all terms to filter
 * @return a new list with the surviving terms
 */
public static List<Term> modifResult(List<Term> all) {
    List<Term> kept = new ArrayList<Term>();
    try {
        for (Term term : all) {
            final String name = term.getName();

            if (FILTER.size() > 0) {
                if (FILTER.contains(name)) {
                    continue;
                }
                if (FILTER.contains(TAG + term.getNatrue().natureStr)) {
                    continue;
                }
            }

            String[] params = UserDefineLibrary.getParams(name);
            if (params != null) {
                term.setNature(new Nature(params[0]));
            }
            kept.add(term);
        }
    } catch (Exception e) {
        System.err.println(
                "FilterStopWord.updateDic can not be null , "
                        + "you must use set FilterStopWord.setUpdateDic(map) or use method set map");
    }
    return kept;
}
/**
 * Scans {@code terms} for person-name candidates and returns every candidate that
 * {@code nameFind} accepts.
 *
 * <p>Only non-null terms whose {@code personAttr.flag} is set are used as starting points;
 * their {@code score} and {@code selfScore} are reset to 0 first. For each starting point the
 * sizes 2, 1 and 0 are tried in that order.
 *
 * @return the recognised name terms, in the order they were found
 */
private List<Term> recogntion_() {
    Term term = null;
    Term tempTerm = null;
    List<Term> termList = new ArrayList<Term>();
    // Frequency passed to nameFind as "beginFreq"; starts at 10 and is afterwards derived
    // from the previously processed starting term.
    int beginFreq = 10;
    for (int i = 0; i < terms.length; i++) {
        term = terms[i];
        if (term == null || !term.getTermNatures().personAttr.flag) {
            continue;
        }
        term.score = 0;
        term.selfScore = 0;
        int freq = 0;
        for (int j = 2; j > -1; j--) {
            // Frequency of this term at position 0 for size j.
            freq = term.getTermNatures().personAttr.getFreq(j, 0);
            // NOTE(review): the second operand also requires freq > 10, so the whole condition
            // is equivalent to freq > 10 — confirm whether a different threshold was intended
            // for two-character names.
            if ((freq > 10) || (term.getName().length() == 2 && freq > 10)) {
                tempTerm = nameFind(i, beginFreq, j);
                if (tempTerm != null) {
                    termList.add(tempTerm);
                    // If the recognition is uncontested (nameFind set the skip field):
                    // clear the scores of the covered terms and jump past the name.
                    if (skip) {
                        for (int j2 = i; j2 < tempTerm.getToValue(); j2++) {
                            if (terms[j2] != null) {
                                terms[j2].score = 0;
                                terms[j2].selfScore = 0;
                            }
                        }
                        // The outer loop's i++ then lands on getToValue().
                        i = tempTerm.getToValue() - 1;
                        break;
                    }
                }
            }
        }
        // Uses the starting term captured before i may have been advanced above.
        beginFreq = term.getTermNatures().personAttr.begin + 1;
    }
    return termList;
}
/** * 新词熵及其左右熵 * * @param all */ public static double leftRightEntropy(List<Term> all) { // TODO Auto-generated method stub double score = 0; NewWordNatureAttr newWordAttr = null; Term first = all.get(0); // 查看左右链接 int twoWordFreq = TwoWordLibrary.getTwoWordFreq(first.getFrom(), first); score -= twoWordFreq; // 查看右连接 int length = all.size() - 1; Term end = all.get(all.size() - 1); twoWordFreq = TwoWordLibrary.getTwoWordFreq(end, end.getTo()); score -= twoWordFreq; // 查看内部链接 for (int i = 0; i < length; i++) { score -= TwoWordLibrary.getTwoWordFreq(all.get(i), all.get(i + 1)); } if (score < -3) { return 0; } // 首字分数 newWordAttr = first.getTermNatures().newWordAttr; score += getTermScore(newWordAttr, newWordAttr.getB()); // 末字分数 newWordAttr = end.getTermNatures().newWordAttr; score += getTermScore(newWordAttr, newWordAttr.getE()); // 中词分数 double midelScore = 0; Term term = null; for (int i = 1; i < length; i++) { term = all.get(i); newWordAttr = term.getTermNatures().newWordAttr; midelScore += getTermScore(newWordAttr, newWordAttr.getM()); } score += midelScore / (length); return score; }
/**
 * Segments {@code input_str} and builds one {@code TermScore} per distinct term name.
 *
 * <p>The first occurrence of a name creates an entry with score 0; each further occurrence
 * increments that entry's score by 1. The {@code tokens} and {@code token_iterator} fields are
 * overwritten, and the iterator is left exhausted.
 *
 * @param input_str text to segment
 * @return the entries keyed by term name
 */
@Override
public HashMap<String, TermScore> tokenizeTerm(String input_str) {
    tokens = ToAnalysis.parse(input_str);
    token_iterator = tokens.listIterator();

    HashMap<String, TermScore> scores = new HashMap<String, TermScore>();
    while (token_iterator.hasNext()) {
        final String name = token_iterator.next().getName();
        final TermScore seen = scores.get(name);
        if (seen != null) {
            // The entry is already mapped; updating it in place is enough.
            seen.setScore(seen.getScore() + 1);
        } else {
            scores.put(name, new TermScore(name, 0));
        }
    }
    return scores;
}
/**
 * Person-name disambiguation performed in place on {@code terms}.
 *
 * <p>Example from the original comment: "邓颖超生前" segmented as "邓颖 超生 前" is fixed to
 * "丁颖超 生 前" (sic). In addition, by rule, two person names connected by "-", "·" or "•"
 * are joined into one.
 *
 * @param terms term array indexed by offset; entries are modified, replaced or set to
 *     {@code null}
 */
public static void nameAmbiguity(Term[] terms) {
    Term from = null;
    Term term = null;
    Term next = null;
    // Pass 1: a two-character NR term takes over the first character of the term that
    // starts two slots later when that term's personAttr.split is positive.
    for (int i = 0; i < terms.length - 1; i++) {
        term = terms[i];
        if (term != null && term.termNatures() == TermNatures.NR && term.getName().length() == 2) {
            // NOTE(review): assumes terms[i + 2] exists and is non-null, and that i + 3 is a
            // valid index below — confirm against how callers build the array.
            next = terms[i + 2];
            if (next.termNatures().personAttr.split > 0) {
                term.setName(term.getName() + next.getName().charAt(0));
                terms[i + 2] = null;
                // The remainder of next becomes a new NW term stored one slot later; it is
                // created with next's original offset.
                terms[i + 3] = new Term(next.getName().substring(1), next.getOffe(), TermNatures.NW);
                TermUtil.termLink(term, terms[i + 3]);
                TermUtil.termLink(terms[i + 3], next.to());
            }
        }
    }
    // Pass 2: foreign person-name correction — a single-character term that CharCover maps
    // to '·' joins its two neighbours when both have a nature string starting with "nr".
    for (int i = 0; i < terms.length; i++) {
        term = terms[i];
        if (term != null && term.getName().length() == 1 && i > 0 && WordAlert.CharCover(term.getName().charAt(0)) == '·') {
            from = term.from();
            next = term.to();
            if (from.natrue().natureStr.startsWith("nr") && next.natrue().natureStr.startsWith("nr")) {
                // The merged name is stored on the left neighbour, which is linked past next.
                from.setName(from.getName() + term.getName() + next.getName());
                TermUtil.termLink(from, next.to());
                terms[i] = null;
                terms[i + 1] = null;
            }
        }
    }
}
/**
 * Computes a score for a pair of terms as the sum of their {@code allFreq} values.
 *
 * <p>The original comment mentions part of speech, word frequency and word length, but only
 * the two frequencies take part in the computation here.
 *
 * @param from the preceding term
 * @param term the following term
 * @return the sum of both terms' {@code allFreq}
 */
public static double compuScoreFreq(Term from, Term term) {
    return from.getTermNatures().allFreq + term.getTermNatures().allFreq;
}
/**
 * Person-name recognition: tries to build a name term starting at index {@code offe} of
 * {@code terms}.
 *
 * <p>The {@code skip} field is cleared on entry and, when a name is returned, set to whether
 * none of the consumed terms had a positive {@code personAttr.allFreq}.
 *
 * @param offe      index in {@code terms} at which the candidate starts; also used as the
 *                  offset of the returned term
 * @param beginFreq frequency whose logarithm is subtracted from the score
 * @param size      size class passed to {@code getFreq} and used to index {@code FACTORY};
 *                  at most {@code size + 2} non-null terms are consumed
 * @return a new NR term carrying the computed {@code selfScore}, or {@code null} when the
 *     candidate is rejected
 */
private Term nameFind(int offe, int beginFreq, int size) {
    StringBuilder sb = new StringBuilder();
    // Number of consumed terms whose personAttr.allFreq is positive.
    int undefinite = 0;
    skip = false;
    PersonNatureAttr pna = null;
    int index = 0;
    int freq = 0;
    double allFreq = 0;
    Term term = null;
    int i = offe;
    for (; i < terms.length; i++) {
        // Walk forward, collecting the terms that make up the name.
        if (terms[i] == null) {
            continue;
        }
        term = terms[i];
        pna = term.getTermNatures().personAttr;
        // Frequency of this term at this position for this size; zero rules the candidate
        // out immediately.
        if ((freq = pna.getFreq(size, index)) == 0) {
            return null;
        }
        if (pna.allFreq > 0) {
            undefinite++;
        }
        sb.append(term.getName());
        allFreq += Math.log(term.getTermNatures().allFreq + 1);
        allFreq += -Math.log((freq));
        index++;
        if (index == size + 2) {
            break;
        }
    }
    double score = -Math.log(FACTORY[size]);
    score += allFreq;
    double endFreq = 0;
    // Look for the first non-null term after the name.
    boolean flag = true;
    while (flag) {
        i++;
        if (i >= terms.length) {
            // Ran off the end of the array: use a fixed end frequency.
            endFreq = 10;
            flag = false;
        } else if (terms[i] != null) {
            // NOTE(review): term is still null here if the loop above consumed nothing —
            // confirm getTwoWordFreq tolerates that or that it cannot happen.
            int twoWordFreq = NgramLibrary.getTwoWordFreq(term, terms[i]);
            // A pair frequency above 3 with the following term rejects the candidate.
            if (twoWordFreq > 3) {
                return null;
            }
            endFreq = terms[i].getTermNatures().personAttr.end + 1;
            flag = false;
        }
    }
    score -= Math.log(endFreq);
    score -= Math.log(beginFreq);
    // Only scores of -3 or lower are accepted.
    if (score > -3) {
        return null;
    }
    if (allFreq > 0 && undefinite > 0) {
        return null;
    }
    skip = undefinite == 0;
    term = new Term(sb.toString(), offe, TermNatures.NR);
    term.selfScore = score;
    return term;
}
/**
 * Walks {@code terms} and, starting from each non-null entry, follows the {@code branch}
 * structure along the chain of successor terms, creating a merged term via
 * {@code makeNewTerm()} whenever a branch with status 2 or 3 is reached.
 *
 * <p>Does nothing when {@code branch} is {@code null} on entry. The last array slot is never
 * used as a starting point. Working state is kept in fields ({@code from}, {@code to},
 * {@code offe}, {@code sb}, {@code score}, {@code tempNature}, {@code branch}) and
 * {@code reset()} is called after every attempt; presumably it restores them for the next
 * starting point — verify against its definition.
 */
public void recognition() {
    if (branch == null) {
        return;
    }
    int length = terms.length - 1;
    Term term = null;
    for (int i = 0; i < length; i++) {
        if (terms[i] == null) {
            continue;
        } else {
            from = terms[i].getFrom();
            terms[i].score = 0;
            terms[i].selfScore = 0;
        }
        branch = branch.getBranch(terms[i].getName());
        // No branch for this term, or a status-3 branch on the very first term: give up on
        // this starting point.
        if (branch == null || branch.getStatus() == 3) {
            reset();
            continue;
        }
        offe = i;
        // Loop: keep looking up successors and appending them.
        term = terms[i];
        sb.append(term.getName());
        if (branch.getStatus() == 2) {
            term.selfScore = branch.getParam().getScore();
        }
        boolean flag = true;
        while (flag) {
            // NOTE(review): assumes getTo() is non-null for every term reached here —
            // confirm the chain always ends in a sentinel term.
            term = term.getTo();
            branch = branch.getBranch(term.getName());
            // Stop when no branch is found.
            if (branch == null) {
                break;
            }
            switch (branch.getStatus()) {
                case 1:
                    // Append and keep following the branch.
                    sb.append(term.getName());
                    continue;
                case 2:
                    // Append, emit a term for the text so far, and keep following.
                    sb.append(term.getName());
                    score = branch.getParam().getScore();
                    tempNature = branch.getParam().getNature();
                    to = term.getTo();
                    makeNewTerm();
                    continue;
                case 3:
                    // Append, emit a term, and stop following.
                    sb.append(term.getName());
                    score = branch.getParam().getScore();
                    tempNature = branch.getParam().getNature();
                    to = term.getTo();
                    makeNewTerm();
                    flag = false;
                    break;
                default:
                    // Any other status: print a diagnostic; the while loop then continues
                    // with the next successor because flag is still true.
                    System.out.println("怎么能出现0呢?");
                    break;
            }
        }
        reset();
    }
}
/**
 * Merges adjacent numeric terms in place (number + number); "zheng" in the original comment
 * is presumably an author tag.
 *
 * <p>Two cases are handled: a "." term sitting between two numeric terms is folded into the
 * left one together with the right one, and a run of consecutive numeric terms (optionally
 * followed by one term with a positive {@code numEndFreq}) is folded into the first term of
 * the run. Array slots of absorbed terms are set to {@code null}. The last array slot is
 * never used as a starting point.
 *
 * @param terms term array indexed by offset; modified in place
 */
public static void recognition(Term[] terms) {
    int length = terms.length - 1;
    Term from = null;
    Term to = null;
    Term temp = null;
    for (int i = 0; i < length; i++) {
        if (terms[i] == null) {
            continue;
        } else if (".".equals(terms[i].getName())) {
            // Special handling when "." has a number on both sides.
            // NOTE(review): assumes getFrom() and getTo() are non-null for every "." term.
            to = terms[i].getTo();
            from = terms[i].getFrom();
            if (from.getTermNatures().numAttr.flag && to.getTermNatures().numAttr.flag) {
                from.setName(from.getName() + "." + to.getName());
                TermUtil.termLink(from, to.getTo());
                terms[to.getOffe()] = null;
                terms[i] = null;
                // Rewind so the next iteration revisits the merged left term.
                i = from.getOffe() - 1;
            }
            continue;
        } else if (!terms[i].getTermNatures().numAttr.flag) {
            continue;
        }
        temp = terms[i];
        // Absorb every directly following numeric term.
        while ((temp = temp.getTo()).getTermNatures().numAttr.flag) {
            terms[i].setName(terms[i].getName() + temp.getName());
        }
        // If the term after the run has a positive numEndFreq, absorb it as well.
        if (temp.getTermNatures().numAttr.numEndFreq > 0) {
            terms[i].setName(terms[i].getName() + temp.getName());
            temp = temp.getTo();
        }
        // If the successor differs, terms[i] has absorbed at least one term.
        if (terms[i].getTo() != temp) {
            TermUtil.termLink(terms[i], temp);
            // Clear the slots of the absorbed terms.
            for (int j = i + 1; j < temp.getOffe(); j++) {
                terms[j] = null;
            }
            // The loop's i++ then continues at temp.
            i = temp.getOffe() - 1;
        }
    }
}