public static void main(String[] args) { // 语言分片 System.out.println(HanLP.segment("你好,欢迎使用HanLP汉语处理包!")); // 标准分词 List<Term> termList = StandardTokenizer.segment("商品和服务"); System.out.println(termList); // NLP 分词 执行全部命名实体识别和词性标注 List<Term> termList2 = NLPTokenizer.segment("中国科学院计算技术研究所的宗成庆教授正在教授自然语言处理课程"); System.out.println(termList2); // 索引分词 是面向搜索引擎的分词器,能够对长词全切分,另外通过term.offset可以获取单词在文本中的偏移量。 List<Term> termList3 = IndexTokenizer.segment("主副食品"); for (Term term : termList3) { System.out.println( term + " [" + term.offset + ":" + (term.offset + term.word.length()) + "]"); } }
@Override public void actionPerformed(ActionEvent e) { // TODO Auto-generated method stub if (e.getSource() == openFile) { clearData(); JFileChooser fc = new JFileChooser(); fc.showOpenDialog(null); int value = 0; if (value == JFileChooser.APPROVE_OPTION) { String filename = null; String filedirectory = null; String file_and_directoryname = null; if (fc.getSelectedFile() == null) { return; } filename = fc.getSelectedFile().getName(); filedirectory = fc.getCurrentDirectory().toString(); file_and_directoryname = filedirectory + "/" + filename; try { FileInputStream filein = new FileInputStream(file_and_directoryname); byte[] b = new byte[filein.available()]; filein.read(b); fileContent0 = new String(b); fileContent1 = fileContent0.replace(" ", ""); editorDockable1.text.setText(fileContent1); filein.close(); } catch (FileNotFoundException e1) { e1.printStackTrace(); } catch (IOException e1) { e1.printStackTrace(); } } } else if (e.getSource() == all) { if (fileContent1 == null || fileContent1 == "" || fileContent1 == " ") { JOptionPane.showMessageDialog(null, "请输入待处理文本"); fileContent1 = null; } else if (fileContent1 != null || fileContent1 != "") { // NLPTokenizer.SEGMENT.enableNumberQuantifierRecognize(true); // NLPTokenizer.SEGMENT.enableOffset(true); String simpleContent = HanLP.convertToSimplifiedChinese(fileContent1); // termList = NLPTokenizer.segment(simpleContent); termList = NLPTokenizer.segment(simpleContent); keywordList = HanLP.extractKeyword(simpleContent, 15); keywordNum = keywordList.size(); sentenceList = HanLP.extractSummary(simpleContent, 5); sentenceNum = sentenceList.size(); summaryList = HanLP.extractSummary(simpleContent, 4); keywordFreList.clear(); for (int i = 0; i < keywordList.size(); i++) { setColorss(keywordList.get(i), Color.red); keywordFreList.add(keywordFre); keywordFre = 0; } fileContent2 = "关键词频率数组" + keywordFreList; fileContent4 = data2 + "\n" + termList.toString(); fileContent3 = data3 + "\n" + keywordList + "\n" + data6 + "\n" + keywordNum + "\n" + data4 + "\n" + sentenceList + "\n" + data7 + "\n" + sentenceNum + "\n" + data5 + "\n" + summaryList; editorDockable4.text.setText(fileContent4); editorDockable3.text.setText(fileContent3); editorDockable2.text.setText(fileContent2); editorDockable5.text.setText(sentenceList.toString()); editorDockable3.text.setCaretPosition( editorDockable3.text.getText().indexOf(keywordList.toString())); editorDockable4.text.setCaretPosition( editorDockable4.text.getText().indexOf(termList.toString())); } } else if (e.getSource() == tagging || e.getSource() == keyword || e.getSource() == keysentence || e.getSource() == summary) { if (fileContent1 != null) { // NLPTokenizer.SEGMENT.enableNumberQuantifierRecognize(true); String simpleContent = HanLP.convertToSimplifiedChinese(fileContent1); if (e.getSource() == tagging) { // termList = NLPTokenizer.segment(simpleContent); // Segment nShortSegment=new NShortSegment().enable termList = NLPTokenizer.segment(simpleContent); fileContent3 = ""; fileContent4 = data2 + "\n" + termList.toString(); editorDockable4.text.setText(fileContent4); editorDockable3.text.setText(fileContent3); editorDockable3.text.setCaretPosition( editorDockable3.text.getText().indexOf(fileContent3.toString())); editorDockable4.text.setCaretPosition( editorDockable4.text.getText().indexOf(termList.toString())); } else if (e.getSource() == keyword) { keywordList = HanLP.extractKeyword(simpleContent, 15); keywordNum = keywordList.size(); fileContent3 = data3 + "\n" + keywordList + "\n" + data6 + "\n" + keywordNum; editorDockable3.text.setText(fileContent3); editorDockable4.text.setText(""); keywordFreList.clear(); for (int i = 0; i < keywordList.size(); i++) { randomColor(); setColorss(keywordList.get(i), new Color(r, g, b)); keywordFreList.add(keywordFre); keywordFre = 0; } fileContent2 = "关键词频率数组" + keywordFreList; editorDockable2.text.setText(fileContent2); editorDockable3.text.setCaretPosition( editorDockable3.text.getText().indexOf(keywordList.toString())); } else if (e.getSource() == keysentence) { sentenceList = HanLP.extractSummary(simpleContent, 5); sentenceNum = sentenceList.size(); fileContent3 = data4 + "\n" + sentenceList + "\n" + data7 + "\n" + sentenceNum; editorDockable3.text.setText(fileContent3); editorDockable3.text.setCaretPosition( editorDockable3.text.getText().indexOf(sentenceList.toString())); editorDockable4.text.setText(""); } else if (e.getSource() == summary) { summaryList = HanLP.extractSummary(simpleContent, 5); // sentenceNum = summaryList.size(); fileContent5 = Pattern.compile("\\b([\\w\\W])\\b") .matcher(summaryList.toString().substring(1, summaryList.toString().length() - 1)) .replaceAll("$1"); editorDockable5.text.setText(fileContent5); // editorDockable3.text.setCaretPosition(editorDockable3.text.getText().indexOf(sentenceList.toString())); editorDockable4.text.setText(""); } } else { JOptionPane.showMessageDialog(null, "请输入待处理文本"); fileContent1 = null; } } else if (e.getSource() == mKeyword) { addKeyword = editorDockable1.text.getSelectedText(); if (addKeyword == null) { JOptionPane.showMessageDialog(null, "请选择关键词"); } else { keywordList.add(addKeyword); keywordNum = keywordList.size(); fileContent3 = data3 + "\n" + keywordList + "\n" + data6 + "\n" + keywordNum; editorDockable3.text.setText(fileContent3); setColorss(keywordList.get(keywordNum - 1), Color.red); keywordFreList.add(keywordFre); fileContent2 = "关键词频率" + "\n" + keywordFre + "\n" + "关键词频率数组" + "\n" + keywordFreList; // 统计的关键词频率 editorDockable2.text.setText(fileContent2); editorDockable3.text.setCaretPosition( editorDockable3.text.getText().indexOf(keywordList.toString())); editorDockable4.text.setText(""); keywordFre = 0; } } else if (e.getSource() == mSentence) { addKeysentence = editorDockable1.text.getSelectedText(); if (addKeysentence == null) { JOptionPane.showMessageDialog(null, "请选择关键句"); } else { sentenceList.add(addKeysentence); sentenceNum = sentenceList.size(); fileContent3 = data4 + "\n" + sentenceList + "\n" + data7 + "\n" + sentenceNum; editorDockable3.text.setText(fileContent3); editorDockable3.text.setCaretPosition(editorDockable3.text.getText().indexOf(data7)); editorDockable4.text.setText(""); } } else if (e.getSource() == mSummary) { // new SummaryText(); getSummary = null; editorDockable1.text.getCaretPosition(); getSummary = editorDockable1.text.getSelectedText(); editorDockable4.text.setText(getSummary); if (getSummary == null) { // summaryText = // editorDockable5.text.getText().substring(summaryList.toString().length()-1); summaryText = editorDockable5.text.getText(); summaryList.clear(); summaryList.add(summaryText); fileContent3 = data5 + "\n" + summaryList; editorDockable3.text.setText(fileContent3); editorDockable4.text.setText(summaryText); } else if (getSummary != null) { summaryList.add(getSummary); getSummary = null; String ss = Pattern.compile("\\b([\\w\\W])\\b") .matcher(summaryList.toString().substring(1, summaryList.toString().length() - 1)) .replaceAll("$1"); editorDockable5.text.setText(ss); fileContent3 = data5 + "\n" + summaryList; editorDockable3.text.setText(fileContent3); } } else if (e.getSource() == showAll) { fileContent4 = data2 + "\n" + termList.toString(); fileContent3 = data3 + "\n" + keywordList + "\n" + data6 + "\n" + keywordNum + "\n" + data4 + "\n" + sentenceList + "\n" + data7 + "\n" + sentenceNum + "\n" + data5 + "\n" + summaryList; editorDockable3.text.setText(fileContent3); editorDockable4.text.setText(fileContent4); editorDockable3.text.setCaretPosition( editorDockable3.text.getText().indexOf(keywordList.toString())); editorDockable4.text.setCaretPosition( editorDockable4.text.getText().indexOf(termList.toString())); } else if (e.getSource() == revokeKeyword) { if (keywordList.size() == 0) { JOptionPane.showMessageDialog(mSummary, "暂时没有关键词,不需撤销!"); } else { setColorss(keywordList.get(keywordNum - 1), Color.DARK_GRAY); keywordList.remove(keywordList.size() - 1); keywordNum = keywordList.size(); keywordFreList.remove(keywordFreList.size() - 1); if (keywordFreList.size() == 0) { fileContent2 = "关键词频率" + "\n" + "null" + "\n" + "关键词频率数组" + "\n" + keywordFreList; } else { fileContent2 = "关键词频率" + "\n" + keywordFreList.get(keywordFreList.size() - 1) + "\n" + "关键词频率数组" + "\n" + keywordFreList; } fileContent3 = data3 + "\n" + keywordList + "\n" + data6 + "\n" + keywordNum + "\n" + data4 + "\n" + sentenceList + "\n" + data7 + "\n" + sentenceNum; editorDockable2.text.setText(fileContent2); editorDockable3.text.setText(fileContent3); editorDockable4.text.setText(fileContent4); editorDockable3.text.setCaretPosition( editorDockable3.text.getText().indexOf(keywordList.toString())); // editorDockable4.text.setCaretPosition(editorDockable4.text.getText().indexOf(termList.toString())); } } else if (e.getSource() == revokeSentence) { if (sentenceList.size() == 0) { JOptionPane.showMessageDialog(mSummary, "暂时没有关键句,不需撤销!"); } else { sentenceList.remove(sentenceList.size() - 1); sentenceNum = sentenceList.size(); fileContent3 = data3 + "\n" + keywordList + "\n" + data6 + "\n" + keywordNum + "\n" + data4 + "\n" + sentenceList + "\n" + data7 + "\n" + sentenceNum; editorDockable3.text.setText(fileContent3); editorDockable4.text.setText(fileContent4); editorDockable3.text.setCaretPosition(editorDockable3.text.getText().indexOf(data7)); // editorDockable4.text.setCaretPosition(editorDockable4.text.getText().indexOf(termList.toString())); } } else if (e.getSource() == revokeSummary) { if (summaryList.size() == 0) { JOptionPane.showMessageDialog(mSummary, "摘要已为空,不需撤销!"); } else { summaryList.remove(summaryList.size() - 1); fileContent3 = data5 + "\n" + summaryList; editorDockable3.text.setText(fileContent3); fileContent5 = Pattern.compile("\\b([\\w\\W])\\b") .matcher(summaryList.toString().substring(1, summaryList.toString().length() - 1)) .replaceAll("$1"); editorDockable5.text.setText(fileContent5); } } else if (e.getSource() == revokeTagging) { if (termList.size() == 0) { JOptionPane.showMessageDialog(mSummary, "没有标注,不可撤销!"); } else { String replace1 = termList.get(termList.size() - 1).toString(); String replace2 = termList.get(termList.size() - 1).word; editorDockable1.text.setText(editorDockable1.text.getText().replaceAll(replace1, replace2)); termList.remove(termList.size() - 1); editorDockable2.text.setText(termList.toString()); } } else if (e.getSource() == clearData) { clearData(); editorDockable1.text.setText(fileContent1); editorDockable2.text.setText(fileContent2); editorDockable3.text.setText(fileContent3); editorDockable4.text.setText(fileContent4); } else if (e.getSource() == wtCollection) { new WordCharacters(); } else if (e.getSource() == setWordCol) { termListTestCol.add("中国/n"); termListTestCol.add("最大/a"); termListTestCol.add("海警船/n"); termListTestCol.add("海警2901/n"); termListTestCol.add("巡航/v"); termListTestCol.add("宣示/v"); termListTestCol.add("耐/ad"); for (int i = 0; i < termListTestCol.size(); i++) { setColorss(termListTestCol.get(i), Color.blue); } } // 手动标注词性 else if (e.getSource() == noun) { editorDockable1.text.getCaretPosition(); String dd = editorDockable1.text.getSelectedText(); editorDockable1.text.replaceSelection(dd + "/n"); setColorss(dd + "/n", Color.blue); Term element = new Term(dd, Nature.n); termList.add(element); editorDockable2.text.setText(termList.toString()); } else if (e.getSource() == adjective) { editorDockable1.text.getCaretPosition(); String dd = editorDockable1.text.getSelectedText(); editorDockable1.text.replaceSelection(dd + "/a"); setColorss(dd + "/a", Color.blue); Term element = new Term(dd, Nature.a); termList.add(element); editorDockable2.text.setText(termList.toString()); } else if (e.getSource() == adverb) { editorDockable1.text.getCaretPosition(); String dd = editorDockable1.text.getSelectedText(); editorDockable1.text.replaceSelection(dd + "/ad"); Term element = new Term(dd, Nature.ad); termList.add(element); editorDockable2.text.setText(termList.toString()); setColorss(dd + "/ad", Color.blue); } else if (e.getSource() == verb) { editorDockable1.text.getCaretPosition(); String dd = editorDockable1.text.getSelectedText(); editorDockable1.text.replaceSelection(dd + "/v"); setColorss(dd + "/v", Color.blue); Term element = new Term(dd, Nature.v); termList.add(element); editorDockable2.text.setText(termList.toString()); } else if (e.getSource() == test) { editorDockable1.text.getCaretPosition(); String dd = editorDockable1.text.getSelectedText(); // Nature nature1=Nature.; editorDockable1.text.replaceSelection(dd + "/a"); Term element = new Term(dd, Nature.a); termList.add(element); fileContent2 = termList.toString(); editorDockable2.text.setText(fileContent2); // setColorss(dd,Color.blue); } else if (e.getSource() == ncs) { editorDockable1.text.getCaretPosition(); String dd = editorDockable1.text.getSelectedText(); editorDockable1.text.replaceSelection(dd + "/ncs"); setColorss(dd + "/ncs", Color.blue); Term element = new Term(dd, Nature.ncs); termList.add(element); editorDockable2.text.setText(termList.toString()); } else if (e.getSource() == saveFile) { try { saveToMongo(); } catch (UnknownHostException e1) { // TODO Auto-generated catch block e1.printStackTrace(); } } else if (e.getSource() == readFile) { try { readMongo(); } catch (UnknownHostException e1) { // TODO Auto-generated catch block e1.printStackTrace(); } } else if (e.getSource() == deleteFile) { try { deletecorpus(); } catch (UnknownHostException e1) { // TODO Auto-generated catch block e1.printStackTrace(); } } else if (e.getSource() == setCorpusType) { // 设置读取时的语料类型(生语料or熟语料) String message = "请选择存入的语料类型!"; String corpusType[] = {"熟语料", "生语料"}; Object obj = JOptionPane.showInputDialog( null, message, "请选择", JOptionPane.INFORMATION_MESSAGE, null, corpusType, "熟语料"); Readcorpustype = ((String) obj); } }