@Test public void test() { String newWord = "爸爸去哪儿"; String nature = "aaaaa"; String str = "上海电力2012年财务报表如下怎爸爸去哪儿么办"; // 增加新词 UserDefineLibrary.insertWord(newWord, nature, 1000); List<Term> parse = ToAnalysis.parse(str); HashMap<String, Term> hs = new HashMap<String, Term>(); for (Term term : parse) { hs.put(term.getName(), term); } Assert.assertTrue(hs.containsKey(newWord)); Assert.assertEquals(hs.get(newWord).natrue().natureStr, nature); // 删除词 UserDefineLibrary.removeWord(newWord); parse = ToAnalysis.parse(str); hs = new HashMap<String, Term>(); for (Term term : parse) { hs.put(term.getName(), term); } Assert.assertTrue(!hs.containsKey(newWord)); }
public List<String> cut(String text) { List<Term> terms = ToAnalysis.parse(text); List<String> array = new ArrayList<>(); StringBuilder sbDebug = new StringBuilder(); for (Term term : terms) { String word = term.getName().trim(); if (word.length() == 0) { continue; } // System.out.println(word); if (filterSingleWord) { // 过滤单字 if (word.length() < 2) { continue; } } if (filterStopWords) { if (stopWordsSet.contains(word)) { continue; } } Matcher matcher = pureNum.matcher(word); // 含数字的词都不要 if (matcher.find()) { continue; } array.add(word); if (debug) { sbDebug.append(word + ","); } } if (debug) { logger.info("cut {} into {}", text, sbDebug.toString()); } return array; }
@Override public EmotionResponse emotionJudge(EmotionRequest req) throws TException { long s1 = System.currentTimeMillis(); String text = req.getText(); List<Term> terms = ToAnalysis.parse(text); List<String> words = new ArrayList<>(); for (Term term : terms) { words.add(term.getName()); } // logger.info("{} is cut into {}", text, words); double score = fbsc.classify(words); EmotionResponse response = new EmotionResponse(); response.setReq(req); response.setScore(score); int emotion = 0; if (score > 0.5) { emotion = 1; } else if (score < -0.5) { emotion = 2; } else { emotion = 3; } response.setEmotion(emotion); long s2 = System.currentTimeMillis(); logger.info("emotion: {}, used:{}ms", emotion, (s2 - s1)); return response; }
/** Segments the {@code line} field and prints each token on its own line. */
public void test() {
  for (Term term : ToAnalysis.parse(line)) {
    System.out.println(term.getName());
  }
}
@Override public List<ParseTerm> parse(String sentence) { // TODO Auto-generated method stub List<ParseTerm> result = new ArrayList<ParseTerm>(); List<Term> list = ToAnalysis.parse(sentence); for (Iterator<Term> it = list.iterator(); it.hasNext(); ) { Term t = it.next(); result.add(new ParseTerm(t.getRealName(), t.getNatureStr())); } return result; }
/**
 * Segments {@code text} and returns the token names joined by single spaces.
 *
 * @param text input text to segment
 * @return space-separated tokens; empty string when nothing is produced
 */
public String segAnsi(String text) {
  StringBuilder sb = new StringBuilder();
  for (Term term : ToAnalysis.parse(text)) {
    // Join with a separator instead of appending a trailing space and
    // trimming afterwards; also removes the dead `segs = ""` initializer.
    if (sb.length() > 0) {
      sb.append(' ');
    }
    sb.append(term.getName());
  }
  return sb.toString();
}
/**
 * Segments {@code content} and keeps only terms whose "word/nature" string
 * ends with 'n' (noun-like natures), returning the bare words each followed
 * by a single space — same output format as before.
 *
 * @param content text to segment
 * @return selected words, each followed by one space (empty string if none)
 */
public static String doSplit(String content) {
  List<Term> lis = ToAnalysis.parse(content);
  // StringBuilder avoids the O(n^2) cost of repeated String concatenation.
  StringBuilder res = new StringBuilder();
  for (Term term : lis) {
    String tem = term.toString(); // format: "word/nature"
    // Guard the charAt against an empty string, then apply the nature filter.
    if (tem.isEmpty() || tem.charAt(tem.length() - 1) != 'n') {
      continue;
    }
    // Strip the "/nature" suffix; the dead `new String[100]` is gone.
    String[] ary = tem.split("/");
    res.append(ary.length > 0 ? ary[0] : tem).append(' ');
  }
  return res.toString();
}
/**
 * Processes one tab-separated record held in the {@code line} field: when it
 * has exactly 8 fields and field 3 parses to an int greater than 1, segments
 * field 7 and prints each token name.
 */
@Override
public void run() {
  String[] fields = line.split("\t");
  if (fields.length != 8) {
    return; // malformed record — nothing to do
  }
  int hfd = Integer.parseInt(fields[3]);
  if (hfd > 1) {
    for (Term term : ToAnalysis.parse(fields[7])) {
      System.out.println(term.getName());
    }
  }
}
/**
 * Loads a {@link Vocabulary} from the JSON file at {@code path}.
 *
 * @param path filesystem path of the JSON dictionary file
 * @return the parsed vocabulary, or {@code null} if the file is missing or unreadable
 */
public static Vocabulary loadFromDB(String path) {
  File file = new File(path);
  if (!file.exists()) {
    logger.error("词典文件不存在, {}", path);
    // Fail fast: the original fell through and tried to read the missing file.
    return null;
  }
  // Warm-up parse (per the literal, "for acc load speed") so later callers
  // don't pay the segmenter's lazy-initialization cost.
  ToAnalysis.parse("hello world for acc load speed");
  try {
    String text = FileUtils.readFileToString(file);
    Gson gson = new Gson();
    return gson.fromJson(text, Vocabulary.class);
  } catch (IOException e) {
    // Keep the cause — the original discarded the stack trace entirely.
    logger.error("gson转化失败", e);
    return null;
  }
}
/**
 * Segments {@code input_str} and builds a map from token text to a
 * {@link TermScore} whose score is (occurrences - 1): the first occurrence
 * inserts with score 0 and each repeat increments by one — identical to the
 * original semantics.
 *
 * <p>Also (re)sets the shared {@code tokens} / {@code token_iterator} fields,
 * as before.
 *
 * @param input_str text to tokenize
 * @return token-to-score map
 */
@Override
public HashMap<String, TermScore> tokenizeTerm(String input_str) {
  tokens = ToAnalysis.parse(input_str);
  token_iterator = tokens.listIterator();
  HashMap<String, TermScore> hash = new HashMap<String, TermScore>();
  while (token_iterator.hasNext()) {
    String name = token_iterator.next().getName();
    // Single lookup per token (the original did up to three) and no redundant
    // re-put: the TermScore is mutated in place, so it is already in the map.
    TermScore existing = hash.get(name);
    if (existing == null) {
      hash.put(name, new TermScore(name, 0));
    } else {
      existing.setScore(existing.getScore() + 1);
    }
  }
  return hash;
}
// Smoke-test driver: runs the ansj segmenter plus nature (POS) recognition over
// a fixed list of Chinese sample sentences (news snippets, names, ambiguous
// phrases) and prints each segmentation result to stdout.
// NOTE(review): ToAnalysis.paser(...) and NatureRecognition.recogntion() are the
// historically misspelled method names of the old ansj_seg API — confirm against
// the pinned library version before renaming (newer releases expose
// parse()/recognition(), and elsewhere in this codebase both spellings appear).
public static void main(String[] args) throws Exception { List<String> list = new ArrayList<String>(); String str = null; list.add( "李宇春《再不疯狂我们就老了》MV首播】李宇春新专辑同名第二主打《再不疯狂我们就老了》MV今日正式发布。这首歌与《似火年华》,以“疯狂”为概念的对话曲目,采用一曲双词的方式。李宇春与韩寒,同时在一首歌里,讲述了两种截然相反,却本质同归的态度"); list.add("上个月在天津术语学会上见到冯老,言谈中感觉到冯老对机器翻译的深厚感情和殷切希望。是啊,机器翻译事业还年轻,我辈细流,心驰沧海,愿倾尽绵薄之力,浇灌此常青之树。"); list.add( "发表了博文 《多语言信息网络时代的语言学家:冯志伟》 - 冯志伟与老伴郑初阳 多语言信息网络时代的语言学家:冯志伟 桂清扬 冯志伟,教育部语言文字应用研究所研究员,博士生导师,所学术委员会"); list.add( "Facebook CEO 马克·扎克伯格亮相了周二 TechCrunch Disrupt 大会,并针对公司不断下挫的股价、移动战略、广告业务等方面发表了讲话。自 5 月公司 IPO 后,扎克伯格极少公开露面,这也是他首次在重要场合公开接受采访"); list.add( "@新华社中国网事:#聚焦钓鱼岛#外交部长杨洁篪10日在外交部紧急召见日本驻华大使丹羽宇一郎,就日本政府非法“购买”钓鱼岛提出严正交涉和强烈抗议。当日,中国驻日本大使程永华也向日本外务省负责人提出严正交涉并递交了抗议照会。"); list.add( "阿米尔汗,8岁时出演一部轰动印度的电影,是公认的童星,长大后却一心打网球并获得过网球冠军。21岁爱上邻居家女孩,由于宗教原因两人决定私奔,现在过着幸福美满的生活。81届奥斯卡最佳影片《贫民窟的百万富翁》,他担任制片。2009年一部《三个白痴》震惊全球,他47岁"); list.add("老郭动粗 师徒揭相声虚假繁荣"); list.add("Facebook CEO 扎克伯格极少公开露面"); list.add("徐德有说这是个错误!"); list.add("而如今Facebook的CEO马克·扎克伯格表示,押在HTML5上是Facebook最大的错误。由于HTML5应用性能差到不能忍受"); list.add( "本报讯(记者胡笑红)已经过期的牛奶被销售经理修改日期,照样投放市场销售,记者昨天从蒙牛公司得到证实,蒙牛驻义乌经理王孙富和同伙赵宝峰因涉嫌生产销售伪劣产品罪已被当地批捕。"); list.add("白玉萍是一个好人"); list.add("张三同李四是好朋友"); list.add("钟子期的名字能够被认出来么"); list.add("綦玉冰"); list.add("汤姆克鲁斯的英文名字很苦"); list.add( "曼城第23分钟遭遇打击,孔帕尼中线丢球,莫里森中路直塞,沙恩-朗拿球成单刀之势,米尔纳背后将其铲倒,主裁判克拉滕伯格认为米尔纳是最后一名防守球员,直接掏出红牌!曼奇尼在场边向第四官员抗议,认为莱斯科特已经补防到位。多兰斯主罚任意球打在人墙上高出。"); list.add( "中新网10月20日电 据日本共同社报道,日本民主党代理干事长安住淳20日表示,首相野田佳彦将履行“近期”解散众院举行大选的承诺,预计在“公债发行特例法案”获得通过等条件具备时解散众院。"); for (String string : list) { List<Term> paser = ToAnalysis.paser(string); new NatureRecognition(paser).recogntion(); System.out.println(paser); } // makeFile() ; // initWordFreq() ;
}
// Segments input_str and resets the shared token state: overwrites the
// `tokens` field with the fresh term list and rewinds `token_iterator` to its
// start. Mutates instance fields with no synchronization visible here, so
// callers must not share this object across threads concurrently.
public void tokenize(String input_str) { tokens = ToAnalysis.parse(input_str); token_iterator = tokens.listIterator(); }
public static List<Unit> create(String query) throws Exception { // TODO Auto-generated method stub /* DocumentBuilderFactory dbf = DocumentBuilderFactory.newInstance(); DocumentBuilder db = dbf.newDocumentBuilder(); Document document = db.parse(new File(Path + "req_result.xml")); NodeList list = document.getElementsByTagName("Pro"); */ if (hasInit == 0) { init(); hasInit = 1; } File file = new File(Path + "TF-IDF_result_x.txt"); OutputStreamWriter write = new OutputStreamWriter(new FileOutputStream(file), "utf-8"); BufferedWriter writer = new BufferedWriter(write); System.out.println(query); List<Term> lis = ToAnalysis.parse(query); List<List<Unit>> res1 = new ArrayList<List<Unit>>(); // List<List<Unit>> res2 = new ArrayList<List<Unit>>(); for (int i = 0; i < lis.size(); i++) { String tem = lis.get(i).toString(); System.out.println(tem); String[] ary = tem.split("/"); String term = ""; double e = ((ary[1].contains("w") || ary[1].contains("nr")) ? 100 : 1); if (!Stopwords.isstop(ary[0]) && ary.length > 0) { term = ary[0]; System.out.println(e); List<Unit> t1 = TFIDF(term); res1.add(Normal(t1, e)); // List<Unit> t2 = BM25(term); // res2.add(Normal(t2)); } } Map<String, Double> res = new HashMap<String, Double>(); for (int i = 0; i < res1.size(); i++) { for (int j = 0; j < res1.get(i).size(); j++) { String iDoc = res1.get(i).get(j).getDocId(); if (res.containsKey(iDoc)) { double tem = res.get(iDoc); res.remove(iDoc); res.put(iDoc, tem + res1.get(i).get(j).getSrc()); } else { res.put(iDoc, res1.get(i).get(j).getSrc()); } } } List<Unit> iRes = new ArrayList<Unit>(); for (String key : res.keySet()) { Unit t = new Unit(); int id = docmap.get(key); t.setDocId(key); t.setChName(docs[id].getChName()); t.setAddr(docs[id].getAddr()); t.setURL(docs[id].getURL()); t.setType(docs[id].getType()); t.setSrc(res.get(key)); iRes.add(t); } Collections.sort(iRes); System.out.println("size = " + iRes.size()); for (int i = 0; i < iRes.size(); i++) { writer.write(iRes.get(i).getDocId() + 
" " + iRes.get(i).getSrc() + "\n"); } writer.close(); /* double maxx = 0,minn = 1e11; for(int i = 0;i < res.size();i ++){ //writer.write(res.get(i).getDocId() + " " + res.get(i).getNum() + "\n"); double tem = res.get(i).getNum(); maxx = Math.max(maxx, tem);minn = Math.min(minn, tem); } for(int i = 0;i < res.size();i ++){ double tem = res.get(i).getNum(); res.get(i).setNum(10 * (tem - minn) / (maxx - minn)); writer.write(res.get(i).getDocId() + " " + res.get(i).getNum() + "\n"); } writer.close(); writer.close(); file = new File("TF-IDF_result.txt"); write = new OutputStreamWriter(new FileOutputStream(file),"utf-8"); writer = new BufferedWriter(write); */ System.out.println("done"); return iRes; }
/** Demo: prints the standard and NLP segmentations of the same sample text. */
public static void main(String[] args) {
  final String sample = "一次性交纳五百元送话费,法轮功";
  List<Term> basic = ToAnalysis.parse(sample);
  List<Term> nlp = NlpAnalysis.parse(sample);
  System.out.println(basic);
  System.out.println(nlp);
}
public static void main(String[] args) { /* for(int i = 50; i < 52; i ++){ File file = new File("C:/Users/hzzhangchi/Desktop/Health/"+i+".txt"); texts = texts +" "+ txt2String(file); } */ File DictFile = new File("C:/Users/hzzhangchi/Desktop/28.txt"); String DictString = txt2String(DictFile); String[] Dict = DictString.split(";"); // 这样才能得到正确的结果 // String[] Dict = // {"美国","健美","中","十","早餐","摄入","减少","少","吃","身体","运动","运动量","纤维素","食物","不易","鱼","肉","鸡肉","香皂","清爽","营养","最佳","新","危险","愈","久","厚","暗","沉","粉刺","慢","小时","富有","35岁","下降","深层","皱纹","刀片","部","受损","保护","紧","绷","灼热","胡须","刮","前","面临","电脑","净化","不吃","节食","意味着","导致","食品","含量","干燥","润肤霜","锻炼","清晨","体重","吸收","产品","肤色","配方","弹性","减肥","健康","脂肪","胆固醇","低","油性","护理","刺激","新陈代谢","剃","高","时","油","毒素","——","角质","男士","层","肌肤","女性","须","误区","男性","皮肤"}; System.out.println(Dict[0]); String texts = null; /* int z = 10; int y = 1; File file = null; while(z<100){ switch(y) { case 1: file = new File("C:/Users/hzzhangchi/Desktop/Health/"+z+".txt"); case 2: file = new File("C:/Users/hzzhangchi/Desktop/Military/"+z+".txt"); //case 3: file = new File("C:/Users/hzzhangchi/Desktop/Net/"+z+".txt"); //case 4: file = new File("C:/Users/hzzhangchi/Desktop/Culture/"+z+".txt"); //case 5: file = new File("C:/Users/hzzhangchi/Desktop/Money/"+z+".txt"); } */ for (int z = 500; z < 1000; z++) { File file = new File("C:/Users/hzzhangchi/Desktop/Health/" + z + ".txt"); texts = txt2String(file); List<Term> al = ToAnalysis.parse(texts); String[] Words = new String[al.size()]; int j = 0; for (Iterator<Term> i = al.iterator(); i.hasNext(); ) { if (j <= al.size()) { Words[j] = i.next().toString(); } else { i.next(); } j = j + 1; } Map<String, Integer> map = new LinkedHashMap(); for (int i = 0; i < Dict.length; i++) { map.put(Dict[i], 0); } for (int i = 0; i < al.size(); i++) { if (map.containsKey(Words[i])) { map.put(Words[i], map.get(Words[i]) + 1); } } // printMap(map); // map = sortMap(map); // printMap(map); // System.out.println(map); try { String 
line = System.getProperty("line.separator"); StringBuffer str = new StringBuffer(); FileWriter fw = new FileWriter("C:/Users/hzzhangchi/Desktop/29.txt", true); Set set = map.entrySet(); Iterator iter = set.iterator(); str.append("1 "); int count = 1; while (iter.hasNext()) { Map.Entry entry = (Map.Entry) iter.next(); // if(Integer.parseInt(entry.getValue().toString())>1){ // str.append("\""+entry.getKey()+"\","); // } // str.append(entry.getKey()).append(line); // str.append(entry.getKey()+":"+entry.getValue()+" "); str.append(count + ":" + entry.getValue() + " "); count = count + 1; } str.append(line); fw.write(str.toString()); fw.close(); } catch (IOException e) { // TODO Auto-generated catch block e.printStackTrace(); } /* z = z +1; if(z == 100){ if(y<2){ y=y+1; z=10; } } */ } for (int z = 500; z < 1000; z++) { File file = new File("C:/Users/hzzhangchi/Desktop/Military/" + z + ".txt"); texts = txt2String(file); List<Term> al = ToAnalysis.parse(texts); String[] Words = new String[al.size()]; int j = 0; for (Iterator<Term> i = al.iterator(); i.hasNext(); ) { if (j <= al.size()) { Words[j] = i.next().toString(); } else { i.next(); } j = j + 1; } Map<String, Integer> map = new LinkedHashMap(); for (int i = 0; i < Dict.length; i++) { map.put(Dict[i], 0); } for (int i = 0; i < al.size(); i++) { if (map.containsKey(Words[i])) { map.put(Words[i], map.get(Words[i]) + 1); } } // printMap(map); // map = sortMap(map); // printMap(map); // System.out.println(map); try { String line = System.getProperty("line.separator"); StringBuffer str = new StringBuffer(); FileWriter fw = new FileWriter("C:/Users/hzzhangchi/Desktop/29.txt", true); Set set = map.entrySet(); Iterator iter = set.iterator(); str.append("2 "); int count = 1; while (iter.hasNext()) { Map.Entry entry = (Map.Entry) iter.next(); // if(Integer.parseInt(entry.getValue().toString())>1){ // str.append("\""+entry.getKey()+"\","); // } // str.append(entry.getKey()).append(line); // 
str.append(entry.getKey()+":"+entry.getValue()+" "); str.append(count + ":" + entry.getValue() + " "); count = count + 1; } str.append(line); fw.write(str.toString()); fw.close(); } catch (IOException e) { // TODO Auto-generated catch block e.printStackTrace(); } /* z = z +1; if(z == 100){ if(y<2){ y=y+1; z=10; } } */ } for (int z = 500; z < 1000; z++) { File file = new File("C:/Users/hzzhangchi/Desktop/Net/" + z + ".txt"); texts = txt2String(file); List<Term> al = ToAnalysis.parse(texts); String[] Words = new String[al.size()]; int j = 0; for (Iterator<Term> i = al.iterator(); i.hasNext(); ) { if (j <= al.size()) { Words[j] = i.next().toString(); } else { i.next(); } j = j + 1; } Map<String, Integer> map = new LinkedHashMap(); for (int i = 0; i < Dict.length; i++) { map.put(Dict[i], 0); } for (int i = 0; i < al.size(); i++) { if (map.containsKey(Words[i])) { map.put(Words[i], map.get(Words[i]) + 1); } } // printMap(map); // map = sortMap(map); // printMap(map); // System.out.println(map); try { String line = System.getProperty("line.separator"); StringBuffer str = new StringBuffer(); FileWriter fw = new FileWriter("C:/Users/hzzhangchi/Desktop/29.txt", true); Set set = map.entrySet(); Iterator iter = set.iterator(); str.append("3 "); int count = 1; while (iter.hasNext()) { Map.Entry entry = (Map.Entry) iter.next(); // if(Integer.parseInt(entry.getValue().toString())>1){ // str.append("\""+entry.getKey()+"\","); // } // str.append(entry.getKey()).append(line); // str.append(entry.getKey()+":"+entry.getValue()+" "); str.append(count + ":" + entry.getValue() + " "); count = count + 1; } str.append(line); fw.write(str.toString()); fw.close(); } catch (IOException e) { // TODO Auto-generated catch block e.printStackTrace(); } /* z = z +1; if(z == 100){ if(y<2){ y=y+1; z=10; } } */ } for (int z = 500; z < 1000; z++) { File file = new File("C:/Users/hzzhangchi/Desktop/Culture/" + z + ".txt"); texts = txt2String(file); List<Term> al = ToAnalysis.parse(texts); String[] 
Words = new String[al.size()]; int j = 0; for (Iterator<Term> i = al.iterator(); i.hasNext(); ) { if (j <= al.size()) { Words[j] = i.next().toString(); } else { i.next(); } j = j + 1; } Map<String, Integer> map = new LinkedHashMap(); for (int i = 0; i < Dict.length; i++) { map.put(Dict[i], 0); } for (int i = 0; i < al.size(); i++) { if (map.containsKey(Words[i])) { map.put(Words[i], map.get(Words[i]) + 1); } } // printMap(map); // map = sortMap(map); // printMap(map); // System.out.println(map); try { String line = System.getProperty("line.separator"); StringBuffer str = new StringBuffer(); FileWriter fw = new FileWriter("C:/Users/hzzhangchi/Desktop/29.txt", true); Set set = map.entrySet(); Iterator iter = set.iterator(); str.append("4 "); int count = 1; while (iter.hasNext()) { Map.Entry entry = (Map.Entry) iter.next(); // if(Integer.parseInt(entry.getValue().toString())>1){ // str.append("\""+entry.getKey()+"\","); // } // str.append(entry.getKey()).append(line); // str.append(entry.getKey()+":"+entry.getValue()+" "); str.append(count + ":" + entry.getValue() + " "); count = count + 1; } str.append(line); fw.write(str.toString()); fw.close(); } catch (IOException e) { // TODO Auto-generated catch block e.printStackTrace(); } /* z = z +1; if(z == 100){ if(y<2){ y=y+1; z=10; } } */ } for (int z = 500; z < 1000; z++) { File file = new File("C:/Users/hzzhangchi/Desktop/Money/" + z + ".txt"); texts = txt2String(file); List<Term> al = ToAnalysis.parse(texts); String[] Words = new String[al.size()]; int j = 0; for (Iterator<Term> i = al.iterator(); i.hasNext(); ) { if (j <= al.size()) { Words[j] = i.next().toString(); } else { i.next(); } j = j + 1; } Map<String, Integer> map = new LinkedHashMap(); for (int i = 0; i < Dict.length; i++) { map.put(Dict[i], 0); } for (int i = 0; i < al.size(); i++) { if (map.containsKey(Words[i])) { map.put(Words[i], map.get(Words[i]) + 1); } } // printMap(map); // map = sortMap(map); // printMap(map); // System.out.println(map); try { 
String line = System.getProperty("line.separator"); StringBuffer str = new StringBuffer(); FileWriter fw = new FileWriter("C:/Users/hzzhangchi/Desktop/29.txt", true); Set set = map.entrySet(); Iterator iter = set.iterator(); str.append("5 "); int count = 1; while (iter.hasNext()) { Map.Entry entry = (Map.Entry) iter.next(); // if(Integer.parseInt(entry.getValue().toString())>1){ // str.append("\""+entry.getKey()+"\","); // } // str.append(entry.getKey()).append(line); // str.append(entry.getKey()+":"+entry.getValue()+" "); str.append(count + ":" + entry.getValue() + " "); count = count + 1; } str.append(line); fw.write(str.toString()); fw.close(); } catch (IOException e) { // TODO Auto-generated catch block e.printStackTrace(); } /* z = z +1; if(z == 100){ if(y<2){ y=y+1; z=10; } } */ } System.out.println(1); }
// Demo: segments a classic segmentation-ambiguity sentence ("结婚的和尚未结婚的…"
// can be read as "the married monk..." or "the married and the not-yet-married...")
// and then applies nature/name recognition over the resulting terms.
// NOTE(review): uses ToAnalysis.paser(...) — the misspelled method name of the old
// ansj_seg API (newer versions call it parse()); here recognition() is spelled
// correctly, unlike the recogntion() call used elsewhere in this codebase.
// Confirm which spellings the pinned library version actually exposes.
public static void main(String[] args) { String str = "结婚的和尚未结婚的孙建是一个好人"; List<Term> terms = ToAnalysis.paser(str); new NatureRecognition(terms).recognition(); }