public void loadData(String srcFilePath, int lablePos, int sentPos, String seperator) throws IOException { Segment segment = MyNLP.newSegment(); // 启用分词器训�? segment.enableIndexMode(true); segment.enablePartOfSpeechTagging(false); segment.enableNameRecognize(true); segment.enablePlaceRecognize(true); segment.enableOrganizationRecognize(true); segment.enableTranslatedNameRecognize(false); segment.enableCustomDictionary(false); segment.enableJapaneseNameRecognize(false); segment.enableAllNamedEntityRecognize(true); BufferedReader br = new BufferedReader(new FileReader(new File(srcFilePath))); String line; int progressIndex = 0; while ((line = br.readLine()) != null) { try { List<Term> segs = segment.seg(line.split(seperator)[sentPos]); // List<Term> segs = segment.seg(MyNLP.extractSummary(line.split(seperator)[sentPos], // 1).get(0)); String label = line.split(seperator)[lablePos]; ; int length = segs.size(); List<String> fieldList = new ArrayList<String>(); for (int i = 0; i < length; ++i) { if (segs.get(i).word.trim().equals("")) { continue; } fieldList.add(segs.get(i).word); Feature feature = new Feature(label, segs.get(i).word); int index = featureList.indexOf(feature); if (index == -1) { featureList.add(feature); featureCountList.add(1); } else { featureCountList.set(index, featureCountList.get(index) + 1); } } if (fieldList.size() > C) C = fieldList.size(); Instance instance = new Instance(label, fieldList); instanceList.add(instance); if (labels.indexOf(label) == -1) labels.add(label); progressIndex++; // System.out.println("Progress" + progressIndex + "/306866"); } catch (Exception e) { // e.printStackTrace(); } } br.close(); }
public void loadCSV(String path) throws IOException { Segment segment = MyNLP.newSegment(); segment.enableIndexMode(true); segment.enablePartOfSpeechTagging(false); segment.enableNameRecognize(true); segment.enablePlaceRecognize(true); segment.enableOrganizationRecognize(true); segment.enableTranslatedNameRecognize(false); segment.enableCustomDictionary(false); segment.enableJapaneseNameRecognize(false); segment.enableAllNamedEntityRecognize(true); BufferedReader br = new BufferedReader(new FileReader(new File(path))); String line = br.readLine(); while (line != null) { String[] csvArray = line.split(";"); if (csvArray.length < 4) { line = br.readLine(); continue; } String content = csvArray[1]; String title = csvArray[2]; String type = csvArray[3]; if (type == null || type.equals("")) { line = br.readLine(); continue; } List<Term> segs = segment.seg(content + ";" + title); String label = type; int length = segs.size(); List<String> fieldList = new ArrayList<String>(); for (int i = 0; i < length; ++i) { if (segs.get(i).word.trim().equals("")) { continue; } fieldList.add(segs.get(i).word); Feature feature = new Feature(label, segs.get(i).word); int index = featureList.indexOf(feature); if (index == -1) { featureList.add(feature); featureCountList.add(1); } else { featureCountList.set(index, featureCountList.get(index) + 1); } } if (fieldList.size() > C) C = fieldList.size(); Instance instance = new Instance(label, fieldList); instanceList.add(instance); if (labels.indexOf(label) == -1) labels.add(label); line = br.readLine(); } // File fl = new File("d:/tmp2Chuge_fl"); // FileWriter fw1 = new FileWriter(fl); // for(int i=0;i<featureList.size();i++){ // fw1.write(featureList.get(i).toString()+"\t"+featureCountList.get(i)+"\n"); // } // fw1.flush(); // fw1.close(); br.close(); }