/**
 * Demo entry point: segments a fixed set of ambiguous Chinese sentences with
 * {@code HanLP.segment} and prints each resulting term list on its own line.
 *
 * @param args command-line arguments (unused)
 */
public static void main(String[] args) {
  final String[] sentences = {
    "商品和服务",
    "结婚的和尚未结婚的确实在干扰分词啊",
    "买水果然后来世博园最后去世博会",
    "中国的首都是北京",
    "工信处女干事每月经过下属科室都要亲口交代24口交换机等技术性器件的安装工作",
    "随着页游兴起到现在的页游繁盛,依赖于存档进行逻辑判断的设计减少了,但这块也不能完全忽略掉。",
  };

  // Walk the samples in declaration order, printing the segmentation of each one.
  for (int index = 0; index < sentences.length; index++) {
    System.out.println(HanLP.segment(sentences[index]));
  }
}
public static void main(String[] args) { // 语言分片 System.out.println(HanLP.segment("你好,欢迎使用HanLP汉语处理包!")); // 标准分词 List<Term> termList = StandardTokenizer.segment("商品和服务"); System.out.println(termList); // NLP 分词 执行全部命名实体识别和词性标注 List<Term> termList2 = NLPTokenizer.segment("中国科学院计算技术研究所的宗成庆教授正在教授自然语言处理课程"); System.out.println(termList2); // 索引分词 是面向搜索引擎的分词器,能够对长词全切分,另外通过term.offset可以获取单词在文本中的偏移量。 List<Term> termList3 = IndexTokenizer.segment("主副食品"); for (Term term : termList3) { System.out.println( term + " [" + term.offset + ":" + (term.offset + term.word.length()) + "]"); } }
public static void main(String[] args) { CoNLLSentence sentence = HanLP.parseDependency("把市场经济奉行的等价交换原则引入党的生活和国家机关政务活动中"); System.out.println(sentence); // 可以方便地遍历它 for (CoNLLWord word : sentence) { System.out.printf("%s --(%s)--> %s\n", word.LEMMA, word.DEPREL, word.HEAD.LEMMA); } // 也可以直接拿到数组,任意顺序或逆序遍历 CoNLLWord[] wordArray = sentence.getWordArray(); for (int i = wordArray.length - 1; i >= 0; i--) { CoNLLWord word = wordArray[i]; System.out.printf("%s --(%s)--> %s\n", word.LEMMA, word.DEPREL, word.HEAD.LEMMA); } // 还可以直接遍历子树,从某棵子树的某个节点一路遍历到虚根 CoNLLWord head = wordArray[1]; while ((head = head.HEAD) != null) { if (head == CoNLLWord.ROOT) System.out.println(head.LEMMA); else System.out.printf("%s --(%s)--> ", head.LEMMA, head.DEPREL); } }
@Override public void actionPerformed(ActionEvent e) { // TODO Auto-generated method stub if (e.getSource() == openFile) { clearData(); JFileChooser fc = new JFileChooser(); fc.showOpenDialog(null); int value = 0; if (value == JFileChooser.APPROVE_OPTION) { String filename = null; String filedirectory = null; String file_and_directoryname = null; if (fc.getSelectedFile() == null) { return; } filename = fc.getSelectedFile().getName(); filedirectory = fc.getCurrentDirectory().toString(); file_and_directoryname = filedirectory + "/" + filename; try { FileInputStream filein = new FileInputStream(file_and_directoryname); byte[] b = new byte[filein.available()]; filein.read(b); fileContent0 = new String(b); fileContent1 = fileContent0.replace(" ", ""); editorDockable1.text.setText(fileContent1); filein.close(); } catch (FileNotFoundException e1) { e1.printStackTrace(); } catch (IOException e1) { e1.printStackTrace(); } } } else if (e.getSource() == all) { if (fileContent1 == null || fileContent1 == "" || fileContent1 == " ") { JOptionPane.showMessageDialog(null, "请输入待处理文本"); fileContent1 = null; } else if (fileContent1 != null || fileContent1 != "") { // NLPTokenizer.SEGMENT.enableNumberQuantifierRecognize(true); // NLPTokenizer.SEGMENT.enableOffset(true); String simpleContent = HanLP.convertToSimplifiedChinese(fileContent1); // termList = NLPTokenizer.segment(simpleContent); termList = NLPTokenizer.segment(simpleContent); keywordList = HanLP.extractKeyword(simpleContent, 15); keywordNum = keywordList.size(); sentenceList = HanLP.extractSummary(simpleContent, 5); sentenceNum = sentenceList.size(); summaryList = HanLP.extractSummary(simpleContent, 4); keywordFreList.clear(); for (int i = 0; i < keywordList.size(); i++) { setColorss(keywordList.get(i), Color.red); keywordFreList.add(keywordFre); keywordFre = 0; } fileContent2 = "关键词频率数组" + keywordFreList; fileContent4 = data2 + "\n" + termList.toString(); fileContent3 = data3 + "\n" + keywordList + "\n" + data6 + "\n" + 
keywordNum + "\n" + data4 + "\n" + sentenceList + "\n" + data7 + "\n" + sentenceNum + "\n" + data5 + "\n" + summaryList; editorDockable4.text.setText(fileContent4); editorDockable3.text.setText(fileContent3); editorDockable2.text.setText(fileContent2); editorDockable5.text.setText(sentenceList.toString()); editorDockable3.text.setCaretPosition( editorDockable3.text.getText().indexOf(keywordList.toString())); editorDockable4.text.setCaretPosition( editorDockable4.text.getText().indexOf(termList.toString())); } } else if (e.getSource() == tagging || e.getSource() == keyword || e.getSource() == keysentence || e.getSource() == summary) { if (fileContent1 != null) { // NLPTokenizer.SEGMENT.enableNumberQuantifierRecognize(true); String simpleContent = HanLP.convertToSimplifiedChinese(fileContent1); if (e.getSource() == tagging) { // termList = NLPTokenizer.segment(simpleContent); // Segment nShortSegment=new NShortSegment().enable termList = NLPTokenizer.segment(simpleContent); fileContent3 = ""; fileContent4 = data2 + "\n" + termList.toString(); editorDockable4.text.setText(fileContent4); editorDockable3.text.setText(fileContent3); editorDockable3.text.setCaretPosition( editorDockable3.text.getText().indexOf(fileContent3.toString())); editorDockable4.text.setCaretPosition( editorDockable4.text.getText().indexOf(termList.toString())); } else if (e.getSource() == keyword) { keywordList = HanLP.extractKeyword(simpleContent, 15); keywordNum = keywordList.size(); fileContent3 = data3 + "\n" + keywordList + "\n" + data6 + "\n" + keywordNum; editorDockable3.text.setText(fileContent3); editorDockable4.text.setText(""); keywordFreList.clear(); for (int i = 0; i < keywordList.size(); i++) { randomColor(); setColorss(keywordList.get(i), new Color(r, g, b)); keywordFreList.add(keywordFre); keywordFre = 0; } fileContent2 = "关键词频率数组" + keywordFreList; editorDockable2.text.setText(fileContent2); editorDockable3.text.setCaretPosition( 
editorDockable3.text.getText().indexOf(keywordList.toString())); } else if (e.getSource() == keysentence) { sentenceList = HanLP.extractSummary(simpleContent, 5); sentenceNum = sentenceList.size(); fileContent3 = data4 + "\n" + sentenceList + "\n" + data7 + "\n" + sentenceNum; editorDockable3.text.setText(fileContent3); editorDockable3.text.setCaretPosition( editorDockable3.text.getText().indexOf(sentenceList.toString())); editorDockable4.text.setText(""); } else if (e.getSource() == summary) { summaryList = HanLP.extractSummary(simpleContent, 5); // sentenceNum = summaryList.size(); fileContent5 = Pattern.compile("\\b([\\w\\W])\\b") .matcher(summaryList.toString().substring(1, summaryList.toString().length() - 1)) .replaceAll("$1"); editorDockable5.text.setText(fileContent5); // editorDockable3.text.setCaretPosition(editorDockable3.text.getText().indexOf(sentenceList.toString())); editorDockable4.text.setText(""); } } else { JOptionPane.showMessageDialog(null, "请输入待处理文本"); fileContent1 = null; } } else if (e.getSource() == mKeyword) { addKeyword = editorDockable1.text.getSelectedText(); if (addKeyword == null) { JOptionPane.showMessageDialog(null, "请选择关键词"); } else { keywordList.add(addKeyword); keywordNum = keywordList.size(); fileContent3 = data3 + "\n" + keywordList + "\n" + data6 + "\n" + keywordNum; editorDockable3.text.setText(fileContent3); setColorss(keywordList.get(keywordNum - 1), Color.red); keywordFreList.add(keywordFre); fileContent2 = "关键词频率" + "\n" + keywordFre + "\n" + "关键词频率数组" + "\n" + keywordFreList; // 统计的关键词频率 editorDockable2.text.setText(fileContent2); editorDockable3.text.setCaretPosition( editorDockable3.text.getText().indexOf(keywordList.toString())); editorDockable4.text.setText(""); keywordFre = 0; } } else if (e.getSource() == mSentence) { addKeysentence = editorDockable1.text.getSelectedText(); if (addKeysentence == null) { JOptionPane.showMessageDialog(null, "请选择关键句"); } else { sentenceList.add(addKeysentence); sentenceNum = 
sentenceList.size(); fileContent3 = data4 + "\n" + sentenceList + "\n" + data7 + "\n" + sentenceNum; editorDockable3.text.setText(fileContent3); editorDockable3.text.setCaretPosition(editorDockable3.text.getText().indexOf(data7)); editorDockable4.text.setText(""); } } else if (e.getSource() == mSummary) { // new SummaryText(); getSummary = null; editorDockable1.text.getCaretPosition(); getSummary = editorDockable1.text.getSelectedText(); editorDockable4.text.setText(getSummary); if (getSummary == null) { // summaryText = // editorDockable5.text.getText().substring(summaryList.toString().length()-1); summaryText = editorDockable5.text.getText(); summaryList.clear(); summaryList.add(summaryText); fileContent3 = data5 + "\n" + summaryList; editorDockable3.text.setText(fileContent3); editorDockable4.text.setText(summaryText); } else if (getSummary != null) { summaryList.add(getSummary); getSummary = null; String ss = Pattern.compile("\\b([\\w\\W])\\b") .matcher(summaryList.toString().substring(1, summaryList.toString().length() - 1)) .replaceAll("$1"); editorDockable5.text.setText(ss); fileContent3 = data5 + "\n" + summaryList; editorDockable3.text.setText(fileContent3); } } else if (e.getSource() == showAll) { fileContent4 = data2 + "\n" + termList.toString(); fileContent3 = data3 + "\n" + keywordList + "\n" + data6 + "\n" + keywordNum + "\n" + data4 + "\n" + sentenceList + "\n" + data7 + "\n" + sentenceNum + "\n" + data5 + "\n" + summaryList; editorDockable3.text.setText(fileContent3); editorDockable4.text.setText(fileContent4); editorDockable3.text.setCaretPosition( editorDockable3.text.getText().indexOf(keywordList.toString())); editorDockable4.text.setCaretPosition( editorDockable4.text.getText().indexOf(termList.toString())); } else if (e.getSource() == revokeKeyword) { if (keywordList.size() == 0) { JOptionPane.showMessageDialog(mSummary, "暂时没有关键词,不需撤销!"); } else { setColorss(keywordList.get(keywordNum - 1), Color.DARK_GRAY); 
keywordList.remove(keywordList.size() - 1); keywordNum = keywordList.size(); keywordFreList.remove(keywordFreList.size() - 1); if (keywordFreList.size() == 0) { fileContent2 = "关键词频率" + "\n" + "null" + "\n" + "关键词频率数组" + "\n" + keywordFreList; } else { fileContent2 = "关键词频率" + "\n" + keywordFreList.get(keywordFreList.size() - 1) + "\n" + "关键词频率数组" + "\n" + keywordFreList; } fileContent3 = data3 + "\n" + keywordList + "\n" + data6 + "\n" + keywordNum + "\n" + data4 + "\n" + sentenceList + "\n" + data7 + "\n" + sentenceNum; editorDockable2.text.setText(fileContent2); editorDockable3.text.setText(fileContent3); editorDockable4.text.setText(fileContent4); editorDockable3.text.setCaretPosition( editorDockable3.text.getText().indexOf(keywordList.toString())); // editorDockable4.text.setCaretPosition(editorDockable4.text.getText().indexOf(termList.toString())); } } else if (e.getSource() == revokeSentence) { if (sentenceList.size() == 0) { JOptionPane.showMessageDialog(mSummary, "暂时没有关键句,不需撤销!"); } else { sentenceList.remove(sentenceList.size() - 1); sentenceNum = sentenceList.size(); fileContent3 = data3 + "\n" + keywordList + "\n" + data6 + "\n" + keywordNum + "\n" + data4 + "\n" + sentenceList + "\n" + data7 + "\n" + sentenceNum; editorDockable3.text.setText(fileContent3); editorDockable4.text.setText(fileContent4); editorDockable3.text.setCaretPosition(editorDockable3.text.getText().indexOf(data7)); // editorDockable4.text.setCaretPosition(editorDockable4.text.getText().indexOf(termList.toString())); } } else if (e.getSource() == revokeSummary) { if (summaryList.size() == 0) { JOptionPane.showMessageDialog(mSummary, "摘要已为空,不需撤销!"); } else { summaryList.remove(summaryList.size() - 1); fileContent3 = data5 + "\n" + summaryList; editorDockable3.text.setText(fileContent3); fileContent5 = Pattern.compile("\\b([\\w\\W])\\b") .matcher(summaryList.toString().substring(1, summaryList.toString().length() - 1)) .replaceAll("$1"); editorDockable5.text.setText(fileContent5); } } 
else if (e.getSource() == revokeTagging) { if (termList.size() == 0) { JOptionPane.showMessageDialog(mSummary, "没有标注,不可撤销!"); } else { String replace1 = termList.get(termList.size() - 1).toString(); String replace2 = termList.get(termList.size() - 1).word; editorDockable1.text.setText(editorDockable1.text.getText().replaceAll(replace1, replace2)); termList.remove(termList.size() - 1); editorDockable2.text.setText(termList.toString()); } } else if (e.getSource() == clearData) { clearData(); editorDockable1.text.setText(fileContent1); editorDockable2.text.setText(fileContent2); editorDockable3.text.setText(fileContent3); editorDockable4.text.setText(fileContent4); } else if (e.getSource() == wtCollection) { new WordCharacters(); } else if (e.getSource() == setWordCol) { termListTestCol.add("中国/n"); termListTestCol.add("最大/a"); termListTestCol.add("海警船/n"); termListTestCol.add("海警2901/n"); termListTestCol.add("巡航/v"); termListTestCol.add("宣示/v"); termListTestCol.add("耐/ad"); for (int i = 0; i < termListTestCol.size(); i++) { setColorss(termListTestCol.get(i), Color.blue); } } // 手动标注词性 else if (e.getSource() == noun) { editorDockable1.text.getCaretPosition(); String dd = editorDockable1.text.getSelectedText(); editorDockable1.text.replaceSelection(dd + "/n"); setColorss(dd + "/n", Color.blue); Term element = new Term(dd, Nature.n); termList.add(element); editorDockable2.text.setText(termList.toString()); } else if (e.getSource() == adjective) { editorDockable1.text.getCaretPosition(); String dd = editorDockable1.text.getSelectedText(); editorDockable1.text.replaceSelection(dd + "/a"); setColorss(dd + "/a", Color.blue); Term element = new Term(dd, Nature.a); termList.add(element); editorDockable2.text.setText(termList.toString()); } else if (e.getSource() == adverb) { editorDockable1.text.getCaretPosition(); String dd = editorDockable1.text.getSelectedText(); editorDockable1.text.replaceSelection(dd + "/ad"); Term element = new Term(dd, Nature.ad); 
termList.add(element); editorDockable2.text.setText(termList.toString()); setColorss(dd + "/ad", Color.blue); } else if (e.getSource() == verb) { editorDockable1.text.getCaretPosition(); String dd = editorDockable1.text.getSelectedText(); editorDockable1.text.replaceSelection(dd + "/v"); setColorss(dd + "/v", Color.blue); Term element = new Term(dd, Nature.v); termList.add(element); editorDockable2.text.setText(termList.toString()); } else if (e.getSource() == test) { editorDockable1.text.getCaretPosition(); String dd = editorDockable1.text.getSelectedText(); // Nature nature1=Nature.; editorDockable1.text.replaceSelection(dd + "/a"); Term element = new Term(dd, Nature.a); termList.add(element); fileContent2 = termList.toString(); editorDockable2.text.setText(fileContent2); // setColorss(dd,Color.blue); } else if (e.getSource() == ncs) { editorDockable1.text.getCaretPosition(); String dd = editorDockable1.text.getSelectedText(); editorDockable1.text.replaceSelection(dd + "/ncs"); setColorss(dd + "/ncs", Color.blue); Term element = new Term(dd, Nature.ncs); termList.add(element); editorDockable2.text.setText(termList.toString()); } else if (e.getSource() == saveFile) { try { saveToMongo(); } catch (UnknownHostException e1) { // TODO Auto-generated catch block e1.printStackTrace(); } } else if (e.getSource() == readFile) { try { readMongo(); } catch (UnknownHostException e1) { // TODO Auto-generated catch block e1.printStackTrace(); } } else if (e.getSource() == deleteFile) { try { deletecorpus(); } catch (UnknownHostException e1) { // TODO Auto-generated catch block e1.printStackTrace(); } } else if (e.getSource() == setCorpusType) { // 设置读取时的语料类型(生语料or熟语料) String message = "请选择存入的语料类型!"; String corpusType[] = {"熟语料", "生语料"}; Object obj = JOptionPane.showInputDialog( null, message, "请选择", JOptionPane.INFORMATION_MESSAGE, null, corpusType, "熟语料"); Readcorpustype = ((String) obj); } }
/** @author hankcs */ public abstract class AbstractDependencyParser implements IDependencyParser { /** 本Parser使用的分词器,可以自由替换 */ private Segment segment = HanLP.newSegment().enablePartOfSpeechTagging(true); /** 依存关系映射表(可以将英文标签映射为中文) */ private Map<String, String> deprelTranslater; /** 是否自动转换依存关系 */ private boolean enableDeprelTranslater; @Override public CoNLLSentence parse(String sentence) { assert sentence != null; CoNLLSentence output = parse(segment.seg(sentence.toCharArray())); if (enableDeprelTranslater && deprelTranslater != null) { for (CoNLLWord word : output) { String translatedDeprel = deprelTranslater.get(word.DEPREL); word.DEPREL = translatedDeprel; } } return output; } @Override public Segment getSegment() { return segment; } @Override public IDependencyParser setSegment(Segment segment) { this.segment = segment; return this; } @Override public Map<String, String> getDeprelTranslator() { return deprelTranslater; } @Override public IDependencyParser setDeprelTranslator(Map<String, String> deprelTranslator) { this.deprelTranslater = deprelTranslator; return this; } /** * 设置映射表 * * @param deprelTranslatorPath 映射表路径 * @return */ public IDependencyParser setDeprelTranslater(String deprelTranslatorPath) { IOUtil.LineIterator iterator = new IOUtil.LineIterator(deprelTranslatorPath); deprelTranslater = new TreeMap<String, String>(); while (iterator.hasNext()) { String[] args = iterator.next().split("\\s"); deprelTranslater.put(args[0], args[1]); } if (deprelTranslater.size() == 0) { deprelTranslater = null; } return this; } @Override public IDependencyParser enableDeprelTranslator(boolean enable) { enableDeprelTranslater = enable; return this; } }