Exemplo n.º 1
0
  /**
   * Reads a delimited text file line by line, tokenizes one column of each line and records the
   * resulting training data in this object's collections.
   *
   * <p>For every line that can be processed:
   *
   * <ul>
   *   <li>each non-blank token is appended to the instance's field list;
   *   <li>the (label, token) pair is looked up in {@code featureList}; a new pair is appended with
   *       a count of 1 in {@code featureCountList}, an existing pair has its count incremented
   *       (the two lists are kept index-aligned);
   *   <li>{@code C} is raised to the instance's token count if that is larger;
   *   <li>a new {@code Instance} is appended to {@code instanceList};
   *   <li>the label is appended to {@code labels} if it is not already present.
   * </ul>
   *
   * <p>Lines that do not contain both requested columns, or whose processing throws, are skipped
   * silently (best-effort loading). Features already counted for a line before it failed are not
   * rolled back.
   *
   * @param srcFilePath path of the file to read
   * @param lablePos zero-based index of the column holding the label
   * @param sentPos zero-based index of the column holding the text to tokenize
   * @param seperator column separator; passed to {@link String#split(String)}, so it is
   *     interpreted as a regular expression
   * @throws IOException if the file cannot be opened or read
   */
  public void loadData(String srcFilePath, int lablePos, int sentPos, String seperator)
      throws IOException {
    // Tokenizer used to build the training data: index mode and all named-entity recognition
    // switched on; part-of-speech tagging and the custom dictionary switched off.
    Segment segment = MyNLP.newSegment();
    segment.enableIndexMode(true);
    segment.enablePartOfSpeechTagging(false);
    segment.enableNameRecognize(true);
    segment.enablePlaceRecognize(true);
    segment.enableOrganizationRecognize(true);
    segment.enableTranslatedNameRecognize(false);
    segment.enableCustomDictionary(false);
    segment.enableJapaneseNameRecognize(false);
    segment.enableAllNamedEntityRecognize(true);

    // NOTE(review): FileReader decodes with the platform default charset — confirm this matches
    // the encoding of the input files.
    BufferedReader br = new BufferedReader(new FileReader(new File(srcFilePath)));
    try {
      String line;
      while ((line = br.readLine()) != null) {
        try {
          // Split once and reuse the columns for both the sentence and the label.
          String[] columns = line.split(seperator);
          boolean columnsPresent =
              lablePos >= 0
                  && sentPos >= 0
                  && lablePos < columns.length
                  && sentPos < columns.length;
          if (!columnsPresent) {
            // Malformed line: one of the requested columns is missing.
            continue;
          }

          List<Term> segs = segment.seg(columns[sentPos]);
          String label = columns[lablePos];

          List<String> fieldList = new ArrayList<String>();
          for (Term term : segs) {
            String word = term.word;
            if (word.trim().equals("")) {
              continue; // whitespace-only tokens carry no information
            }
            fieldList.add(word);

            // featureList and featureCountList are parallel: same index, same feature.
            Feature feature = new Feature(label, word);
            int index = featureList.indexOf(feature);
            if (index == -1) {
              featureList.add(feature);
              featureCountList.add(1);
            } else {
              featureCountList.set(index, featureCountList.get(index) + 1);
            }
          }

          // C tracks the largest number of tokens seen in a single instance.
          if (fieldList.size() > C) {
            C = fieldList.size();
          }
          instanceList.add(new Instance(label, fieldList));
          if (labels.indexOf(label) == -1) {
            labels.add(label);
          }
        } catch (Exception ignored) {
          // Deliberate best-effort: a line that cannot be split or tokenized is skipped so that
          // one bad record does not abort the whole load.
        }
      }
    } finally {
      // Always release the file handle, including when readLine() throws.
      br.close();
    }
  }
Exemplo n.º 2
0
  /**
   * Reads a semicolon-separated file line by line and records the resulting training data in this
   * object's collections.
   *
   * <p>Each line is split on {@code ";"}. Column 1 is used as the content, column 2 as the title
   * and column 3 as the label. Lines with fewer than four columns, or with an empty label, are
   * skipped. For every remaining line the text {@code content + ";" + title} is tokenized and:
   *
   * <ul>
   *   <li>each non-blank token is appended to the instance's field list;
   *   <li>the (label, token) pair is looked up in {@code featureList}; a new pair is appended with
   *       a count of 1 in {@code featureCountList}, an existing pair has its count incremented
   *       (the two lists are kept index-aligned);
   *   <li>{@code C} is raised to the instance's token count if that is larger;
   *   <li>a new {@code Instance} is appended to {@code instanceList};
   *   <li>the label is appended to {@code labels} if it is not already present.
   * </ul>
   *
   * @param path path of the file to read
   * @throws IOException if the file cannot be opened or read
   */
  public void loadCSV(String path) throws IOException {
    // Tokenizer configuration: index mode and all named-entity recognition switched on;
    // part-of-speech tagging and the custom dictionary switched off.
    Segment segment = MyNLP.newSegment();
    segment.enableIndexMode(true);
    segment.enablePartOfSpeechTagging(false);
    segment.enableNameRecognize(true);
    segment.enablePlaceRecognize(true);
    segment.enableOrganizationRecognize(true);
    segment.enableTranslatedNameRecognize(false);
    segment.enableCustomDictionary(false);
    segment.enableJapaneseNameRecognize(false);
    segment.enableAllNamedEntityRecognize(true);

    // NOTE(review): FileReader decodes with the platform default charset — confirm this matches
    // the encoding of the input files.
    BufferedReader br = new BufferedReader(new FileReader(new File(path)));
    try {
      String line;
      // Reading in the loop condition keeps every `continue` below from needing its own
      // readLine() call.
      while ((line = br.readLine()) != null) {
        String[] csvArray = line.split(";");
        if (csvArray.length < 4) {
          continue; // not enough columns to hold content, title and label
        }
        String content = csvArray[1];
        String title = csvArray[2];
        String label = csvArray[3];
        // String.split never produces null elements, so only the empty case needs checking.
        if (label.equals("")) {
          continue;
        }

        List<Term> segs = segment.seg(content + ";" + title);
        List<String> fieldList = new ArrayList<String>();
        for (Term term : segs) {
          String word = term.word;
          if (word.trim().equals("")) {
            continue; // whitespace-only tokens carry no information
          }
          fieldList.add(word);

          // featureList and featureCountList are parallel: same index, same feature.
          Feature feature = new Feature(label, word);
          int index = featureList.indexOf(feature);
          if (index == -1) {
            featureList.add(feature);
            featureCountList.add(1);
          } else {
            featureCountList.set(index, featureCountList.get(index) + 1);
          }
        }

        // C tracks the largest number of tokens seen in a single instance.
        if (fieldList.size() > C) {
          C = fieldList.size();
        }
        instanceList.add(new Instance(label, fieldList));
        if (labels.indexOf(label) == -1) {
          labels.add(label);
        }
      }
    } finally {
      // Always release the file handle, including when reading or tokenizing throws.
      br.close();
    }
  }