@Test
public void test() {
    String newWord = "爸爸去哪儿";
    String nature = "aaaaa";
    String str = "上海电力2012年财务报表如下怎爸爸去哪儿么办";
    // Add the new word to the user dictionary.
    UserDefineLibrary.insertWord(newWord, nature, 1000);
    List<Term> parse = ToAnalysis.parse(str);
    HashMap<String, Term> hs = new HashMap<String, Term>();
    for (Term term : parse) {
        hs.put(term.getName(), term);
    }
    Assert.assertTrue(hs.containsKey(newWord));
    // Note: natrue() is the actual (misspelled) accessor on ansj's Term class.
    Assert.assertEquals(hs.get(newWord).natrue().natureStr, nature);
    // Remove the word; it should no longer appear in the segmentation.
    UserDefineLibrary.removeWord(newWord);
    parse = ToAnalysis.parse(str);
    hs = new HashMap<String, Term>();
    for (Term term : parse) {
        hs.put(term.getName(), term);
    }
    Assert.assertFalse(hs.containsKey(newWord));
}
public List<String> cut(String text) {
    List<Term> terms = ToAnalysis.parse(text);
    List<String> array = new ArrayList<>();
    StringBuilder sbDebug = new StringBuilder();
    for (Term term : terms) {
        String word = term.getName().trim();
        if (word.isEmpty()) {
            continue;
        }
        // Drop single-character words.
        if (filterSingleWord && word.length() < 2) {
            continue;
        }
        // Drop stop words.
        if (filterStopWords && stopWordsSet.contains(word)) {
            continue;
        }
        // Drop any word that contains a digit.
        Matcher matcher = pureNum.matcher(word);
        if (matcher.find()) {
            continue;
        }
        array.add(word);
        if (debug) {
            sbDebug.append(word).append(',');
        }
    }
    if (debug) {
        logger.info("cut {} into {}", text, sbDebug);
    }
    return array;
}
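// For reference, a self-contained sketch of cut()'s filtering pipeline.
// The enclosing class's fields (filterSingleWord, filterStopWords,
// stopWordsSet, pureNum) are not shown above, so the digit pattern and
// stop-word set below are assumptions, not the original configuration.
import java.util.*;
import java.util.regex.Pattern;

public class CutFilterSketch {
    // Assumption: pureNum flags any word containing a digit.
    private static final Pattern PURE_NUM = Pattern.compile("\\d");
    private static final Set<String> STOP_WORDS = new HashSet<>(Arrays.asList("的", "了"));

    static List<String> filter(List<String> words) {
        List<String> out = new ArrayList<>();
        for (String w : words) {
            w = w.trim();
            if (w.length() < 2) continue;             // drop empty and single-character words
            if (STOP_WORDS.contains(w)) continue;     // drop stop words
            if (PURE_NUM.matcher(w).find()) continue; // drop words containing digits
            out.add(w);
        }
        return out;
    }

    public static void main(String[] args) {
        // Only multi-character, digit-free, non-stop words survive.
        System.out.println(filter(Arrays.asList("上海", "的", "2012", "电", "财务报表")));
        // -> [上海, 财务报表]
    }
}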
@Override
public EmotionResponse emotionJudge(EmotionRequest req) throws TException {
    long start = System.currentTimeMillis();
    String text = req.getText();
    List<Term> terms = ToAnalysis.parse(text);
    List<String> words = new ArrayList<>();
    for (Term term : terms) {
        words.add(term.getName());
    }
    double score = fbsc.classify(words);
    EmotionResponse response = new EmotionResponse();
    response.setReq(req);
    response.setScore(score);
    int emotion;
    if (score > 0.5) {
        emotion = 1;
    } else if (score < -0.5) {
        emotion = 2;
    } else {
        emotion = 3;
    }
    response.setEmotion(emotion);
    logger.info("emotion: {}, used: {}ms", emotion, System.currentTimeMillis() - start);
    return response;
}
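// The thresholding above maps the classifier score onto three codes; a
// minimal sketch of just that mapping. The positive/negative/neutral
// labels are an assumption about what the codes mean, not stated above.
public class EmotionThresholdSketch {
    static int toEmotion(double score) {
        if (score > 0.5) return 1;   // assumed: positive
        if (score < -0.5) return 2;  // assumed: negative
        return 3;                    // assumed: neutral
    }

    public static void main(String[] args) {
        System.out.println(toEmotion(0.9));  // 1
        System.out.println(toEmotion(-0.9)); // 2
        System.out.println(toEmotion(0.1));  // 3
    }
}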
public void test() {
    List<Term> terms = ToAnalysis.parse(line);
    for (Term term : terms) {
        System.out.println(term.getName());
    }
}
@Override
public List<ParseTerm> parse(String sentence) {
    List<ParseTerm> result = new ArrayList<ParseTerm>();
    for (Term t : ToAnalysis.parse(sentence)) {
        result.add(new ParseTerm(t.getRealName(), t.getNatureStr()));
    }
    return result;
}
public String segAnsi(String text) {
    StringBuilder sb = new StringBuilder();
    for (Term term : ToAnalysis.parse(text)) {
        sb.append(term.getName());
        sb.append(' ');
    }
    return sb.toString().trim();
}
public static String doSplit(String content) {
    List<Term> lis = ToAnalysis.parse(content);
    StringBuilder res = new StringBuilder();
    for (Term term : lis) {
        // Term.toString() is "word/nature"; keep only terms whose nature
        // tag ends in 'n' (n, vn, an, ...), i.e. noun-like words.
        String tem = term.toString();
        if (tem.charAt(tem.length() - 1) != 'n') {
            continue;
        }
        String[] ary = tem.split("/");
        if (ary.length > 0) {
            tem = ary[0];
        }
        res.append(tem).append(' ');
    }
    return res.toString();
}
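// The same noun filter can read the nature tag directly via getNatureStr()
// (used elsewhere in this collection) instead of string-splitting the
// "word/nature" toString() form. A minimal sketch, assuming the ansj
// version used here, where ToAnalysis.parse returns List<Term>:
import java.util.List;
import org.ansj.domain.Term;
import org.ansj.splitWord.analysis.ToAnalysis;

public class NounFilterSketch {
    public static void main(String[] args) {
        List<Term> terms = ToAnalysis.parse("上海电力2012年财务报表");
        StringBuilder res = new StringBuilder();
        for (Term t : terms) {
            String nature = t.getNatureStr();
            // Keep noun-like tags: n, vn, an, ...
            if (nature != null && nature.endsWith("n")) {
                res.append(t.getName()).append(' ');
            }
        }
        System.out.println(res.toString().trim());
    }
}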
@Override
public void run() {
    String[] strings = line.split("\t");
    if (strings.length == 8) {
        int hfd = Integer.parseInt(strings[3]);
        if (hfd > 1) {
            String content = strings[7];
            for (Term term : ToAnalysis.parse(content)) {
                System.out.println(term.getName());
            }
        }
    }
}
public static Vocabulary loadFromDB(String path) {
    File file = new File(path);
    if (!file.exists()) {
        logger.error("dictionary file does not exist: {}", path);
    }
    // Throwaway parse to force ansj to load its dictionaries up front,
    // so later calls don't pay the initialization cost.
    ToAnalysis.parse("hello world for acc load speed");
    try {
        String text = FileUtils.readFileToString(file);
        Gson gson = new Gson();
        return gson.fromJson(text, Vocabulary.class);
    } catch (IOException e) {
        logger.error("gson conversion failed", e);
        return null;
    }
}
@Override
public HashMap<String, TermScore> tokenizeTerm(String input_str) {
    tokens = ToAnalysis.parse(input_str);
    token_iterator = tokens.listIterator();
    HashMap<String, TermScore> hash = new HashMap<String, TermScore>();
    while (token_iterator.hasNext()) {
        Term term = token_iterator.next();
        String name = term.getName();
        TermScore existing = hash.get(name);
        if (existing == null) {
            // First occurrence starts at 0, so a term seen n times ends with score n - 1.
            hash.put(name, new TermScore(name, 0));
        } else {
            // The object is already in the map; mutating it in place suffices.
            existing.setScore(existing.getScore() + 1);
        }
    }
    return hash;
}
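// If plain occurrence counts are wanted instead of the n - 1 scores above,
// a Map.merge-based sketch (Java 8+, same ansj version assumption as above):
import java.util.HashMap;
import java.util.Map;
import org.ansj.domain.Term;
import org.ansj.splitWord.analysis.ToAnalysis;

public class TermFreqSketch {
    public static void main(String[] args) {
        Map<String, Integer> freq = new HashMap<>();
        for (Term term : ToAnalysis.parse("中国加油中国")) {
            // merge() covers both the first-seen and the increment case.
            freq.merge(term.getName(), 1, Integer::sum);
        }
        System.out.println(freq);
    }
}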
public void tokenize(String input_str) {
    tokens = ToAnalysis.parse(input_str);
    token_iterator = tokens.listIterator();
}
public static List<Unit> create(String query) throws Exception {
    if (hasInit == 0) {
        init();
        hasInit = 1;
    }
    File file = new File(Path + "TF-IDF_result_x.txt");
    OutputStreamWriter write = new OutputStreamWriter(new FileOutputStream(file), "utf-8");
    BufferedWriter writer = new BufferedWriter(write);

    // Segment the query and collect one normalized TF-IDF result list per term.
    List<Term> lis = ToAnalysis.parse(query);
    List<List<Unit>> res1 = new ArrayList<List<Unit>>();
    for (Term t : lis) {
        // Term.toString() is "word/nature"; guard the length before reading ary[1].
        String[] ary = t.toString().split("/");
        if (ary.length < 2 || Stopwords.isstop(ary[0])) {
            continue;
        }
        // Punctuation (w) and person names (nr) get a much higher weight.
        double e = (ary[1].contains("w") || ary[1].contains("nr")) ? 100 : 1;
        List<Unit> t1 = TFIDF(ary[0]);
        res1.add(Normal(t1, e));
    }

    // Sum the per-term scores for each document.
    Map<String, Double> res = new HashMap<String, Double>();
    for (List<Unit> units : res1) {
        for (Unit u : units) {
            Double prev = res.get(u.getDocId());
            res.put(u.getDocId(), (prev == null ? 0 : prev) + u.getSrc());
        }
    }

    // Materialize the aggregated scores back into Unit objects and sort.
    List<Unit> iRes = new ArrayList<Unit>();
    for (Map.Entry<String, Double> entry : res.entrySet()) {
        String key = entry.getKey();
        int id = docmap.get(key);
        Unit t = new Unit();
        t.setDocId(key);
        t.setChName(docs[id].getChName());
        t.setAddr(docs[id].getAddr());
        t.setURL(docs[id].getURL());
        t.setType(docs[id].getType());
        t.setSrc(entry.getValue());
        iRes.add(t);
    }
    Collections.sort(iRes);

    System.out.println("size = " + iRes.size());
    for (Unit u : iRes) {
        writer.write(u.getDocId() + " " + u.getSrc() + "\n");
    }
    writer.close();
    System.out.println("done");
    return iRes;
}
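// The per-document aggregation in create() boils down to summing each
// document's scores across the per-term result lists; the get/put pair
// can also be a single Map.merge (Java 8+). A minimal standalone sketch
// with hypothetical doc IDs and scores:
import java.util.HashMap;
import java.util.Map;

public class ScoreMergeSketch {
    public static void main(String[] args) {
        Map<String, Double> total = new HashMap<>();
        total.merge("d1", 0.4, Double::sum); // scores from term 1
        total.merge("d2", 0.1, Double::sum);
        total.merge("d1", 0.3, Double::sum); // scores from term 2
        System.out.println(total); // {d1=0.7, d2=0.1}
    }
}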
public static void main(String[] args) {
    System.out.println(ToAnalysis.parse("一次性交纳五百元送话费,法轮功"));
    System.out.println(NlpAnalysis.parse("一次性交纳五百元送话费,法轮功"));
}
public static void main(String[] args) {
    // The feature dictionary is read from a file; splitting on ';' is
    // what yields the correct word list here.
    File dictFile = new File("C:/Users/hzzhangchi/Desktop/28.txt");
    String[] dict = txt2String(dictFile).split(";");
    System.out.println(dict[0]);

    // One pass per category; the label becomes the first field of each output line.
    processCategory("Health", 1, dict);
    processCategory("Military", 2, dict);
    processCategory("Net", 3, dict);
    processCategory("Culture", 4, dict);
    processCategory("Money", 5, dict);
    System.out.println(1);
}

// Count dictionary-word frequencies in documents 500..999 of one category
// and append one feature line per document to 29.txt.
private static void processCategory(String category, int label, String[] dict) {
    for (int z = 500; z < 1000; z++) {
        File file = new File("C:/Users/hzzhangchi/Desktop/" + category + "/" + z + ".txt");
        String text = txt2String(file);
        List<Term> terms = ToAnalysis.parse(text);

        // Initialize every dictionary word with a zero count, preserving order.
        Map<String, Integer> map = new LinkedHashMap<String, Integer>();
        for (String word : dict) {
            map.put(word, 0);
        }
        for (Term term : terms) {
            // Kept as in the original: Term.toString() may carry a "/nature"
            // suffix, whereas getName() would give the bare word.
            String word = term.toString();
            if (map.containsKey(word)) {
                map.put(word, map.get(word) + 1);
            }
        }

        // Write "label 1:count 2:count ..." (1-based feature indices).
        try {
            String line = System.getProperty("line.separator");
            StringBuilder str = new StringBuilder();
            FileWriter fw = new FileWriter("C:/Users/hzzhangchi/Desktop/29.txt", true);
            str.append(label).append(' ');
            int count = 1;
            for (Map.Entry<String, Integer> entry : map.entrySet()) {
                str.append(count).append(':').append(entry.getValue()).append(' ');
                count++;
            }
            str.append(line);
            fw.write(str.toString());
            fw.close();
        } catch (IOException e) {
            e.printStackTrace();
        }
    }
}
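// Each line written to 29.txt is "label index:count ...", the label/index:value
// layout used by libsvm-style tools (though the code above also keeps zero
// counts). A minimal sketch of building one such line from a count vector:
public class SparseLineSketch {
    public static void main(String[] args) {
        int label = 1;
        int[] counts = {3, 0, 7}; // hypothetical per-feature counts
        StringBuilder line = new StringBuilder().append(label).append(' ');
        for (int i = 0; i < counts.length; i++) {
            line.append(i + 1).append(':').append(counts[i]).append(' ');
        }
        System.out.println(line.toString().trim()); // 1 1:3 2:0 3:7
    }
}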