Пример #1
0
  /**
   * Parses in-memory text into sentences, one sentence per input line.
   *
   * <p>Lines starting with {@code #} are skipped. Every other line, including an empty one,
   * yields one {@link Sentence} built from the space-separated tokens of that line. When
   * {@code isTrainReading} is false, each token is added as a word with a {@code null} tag.
   * When it is true, each token is expected to look like {@code word/tag} and is split at its
   * <em>last</em> {@code /}, so a word that itself contains a slash keeps its full text. A token
   * with no slash is added whole with a {@code null} tag, and a token with nothing after its
   * last slash gets a {@code null} tag, instead of aborting the whole read.
   *
   * @param dataStr the text to parse; lines may be terminated by LF or CRLF
   * @return the parsed sentences, in input order
   * @see jvntextpro.data.DataReader#readString(java.lang.String)
   */
  public List<Sentence> readString(String dataStr) {
    // Accept CRLF as well as LF so a trailing carriage return never sticks to the last word.
    String[] lines = dataStr.split("\r?\n");
    List<Sentence> data = new ArrayList<Sentence>();
    for (String line : lines) {
      // Comment lines produce no sentence at all.
      if (line.startsWith("#")) {
        continue;
      }

      Sentence sentence = new Sentence();
      StringTokenizer tk = new StringTokenizer(line, " ");
      while (tk.hasMoreTokens()) {
        String token = tk.nextToken();
        String word = token;
        String tag = null;
        if (isTrainReading) {
          // Split at the LAST slash: everything before it is the word, everything after the tag.
          int slash = token.lastIndexOf('/');
          if (slash >= 0) {
            word = token.substring(0, slash);
            if (slash < token.length() - 1) {
              tag = token.substring(slash + 1);
            }
          }
        }
        sentence.addTWord(word, tag);
      }
      data.add(sentence);
    }

    return data;
  }
Пример #2
0
  /**
   * Reads a UTF-8 encoded text file into sentences, one sentence per line.
   *
   * <p>Lines starting with {@code #} are skipped. Every other line, including an empty one,
   * yields one {@link Sentence} built from the space-separated tokens of that line. When
   * {@code isTrainReading} is false, each token is added as a word with a {@code null} tag.
   * When it is true, the token is checked against the entries of {@code tags} in order: the
   * first entry for which the token ends with {@code "/" + tag} is taken as the tag, and the
   * text before that suffix as the word. A token that ends with none of the known tags is kept
   * whole as the word with a {@code null} tag.
   *
   * <p>Any exception is reported by printing its message to standard output, and an empty list
   * is returned in place of the partially read data. The reader is closed whether or not
   * reading succeeds.
   *
   * @param datafile path of the file to read
   * @return the parsed sentences in file order, or an empty list if reading failed
   * @see jvntextpro.data.DataReader#readFile(java.lang.String)
   */
  @Override
  public List<Sentence> readFile(String datafile) {
    try {
      BufferedReader reader =
          new BufferedReader(new InputStreamReader(new FileInputStream(datafile), "UTF-8"));
      try {
        List<Sentence> data = new ArrayList<Sentence>();
        String line;
        while ((line = reader.readLine()) != null) {
          // Comment lines produce no sentence at all.
          if (line.startsWith("#")) {
            continue;
          }

          Sentence sentence = new Sentence();
          StringTokenizer tk = new StringTokenizer(line, " ");
          while (tk.hasMoreTokens()) {
            String token = tk.nextToken();
            // Default to the raw token so an unrecognised tag never erases the word itself.
            String word = token;
            String tag = null;
            if (isTrainReading) {
              for (int i = 0; i < tags.length; ++i) {
                String labelPart = "/" + tags[i];
                if (token.endsWith(labelPart)) {
                  word = token.substring(0, token.length() - labelPart.length());
                  tag = tags[i];
                  break;
                }
              }
            }
            sentence.addTWord(word, tag);
          }
          data.add(sentence);
        }
        return data;
      } finally {
        // Runs on success and on failure, so the file handle is never leaked.
        reader.close();
      }
    } catch (Exception e) {
      System.out.println(e.getMessage());
      return new ArrayList<Sentence>(); // empty list stands in for the failed read
    }
  }