Ejemplo n.º 1
0
  /**
   * Reads the given stream to its end, stems every word found in it, and returns the stems as one
   * string.
   *
   * <p>A word is a maximal run of bytes for which {@link Character#isLetter(char)} is true; every
   * other byte only separates words and is not copied to the result. Each byte is treated as one
   * character (no charset decoding is done). Letters are lower-cased before being handed to the
   * {@code Stemmer}. Only the first 500 letters of a word are stemmed.
   *
   * <p>Each stem is appended with a single leading space, so a non-empty result starts with a
   * space. If reading fails, {@code "error reading "} is printed to standard output and the stems
   * collected so far are returned. The stream is not closed by this method.
   *
   * @param in the stream to read words from
   * @return the stems of all words read, each preceded by one space; empty if no word was found
   */
  public String stemFile(FileInputStream in) {
    char[] w = new char[501];
    // A builder avoids re-copying the whole result for every word appended.
    StringBuilder stemmed = new StringBuilder();
    // One Stemmer is reused for every word.
    // NOTE(review): this relies on stem() leaving the Stemmer ready for the next add() - confirm.
    Stemmer s = new Stemmer();
    try {
      while (true) {
        int ch = in.read();
        // At end of stream ch is -1, which casts to '\uFFFF'; that is not a letter.
        if (Character.isLetter((char) ch)) {
          int j = 0;
          do {
            w[j] = Character.toLowerCase((char) ch);
            // Past 500 letters j stops advancing, so the overflow keeps overwriting w[500],
            // which is never passed to the Stemmer.
            if (j < 500) {
              j++;
            }
            ch = in.read();
          } while (Character.isLetter((char) ch));

          for (int c = 0; c < j; c++) {
            s.add(w[c]);
          }
          s.stem();
          stemmed.append(' ').append(s.toString());
        }
        if (ch < 0) {
          break;
        }
      }
    } catch (IOException e) {
      System.out.println("error reading ");
    }
    return stemmed.toString();
  }
Ejemplo n.º 2
0
  /**
   * Test program for demonstrating the Stemmer. It reads text from a list of files, stems each
   * word, and writes the result to standard output: every run of letters is replaced by its stem,
   * and every other byte is echoed unchanged.
   *
   * <p>Letters are lower-cased here before being passed to the Stemmer. Each byte is treated as one
   * character, and only the first 500 letters of a word are stemmed. Processing stops at the first
   * file that cannot be opened or read; later arguments are then not processed.
   *
   * <p>Usage: Stemmer file-name file-name ...
   *
   * @param args names of the files to stem, processed in order
   */
  public static void main(String[] args) {
    char[] w = new char[501];
    // One Stemmer is reused for every word of every file.
    // NOTE(review): this relies on stem() leaving the Stemmer ready for the next add() - confirm.
    Stemmer s = new Stemmer();
    for (int i = 0; i < args.length; i++) {
      FileInputStream in;
      try {
        in = new FileInputStream(args[i]);
      } catch (FileNotFoundException e) {
        System.out.println("file " + args[i] + " not found");
        break;
      }

      try {
        while (true) {
          int ch = in.read();
          // At end of stream ch is -1, which casts to '\uFFFF'; that is not a letter.
          if (Character.isLetter((char) ch)) {
            int j = 0;
            do {
              w[j] = Character.toLowerCase((char) ch);
              // Past 500 letters j stops advancing, so the overflow keeps overwriting w[500],
              // which is never passed to the Stemmer.
              if (j < 500) {
                j++;
              }
              ch = in.read();
            } while (Character.isLetter((char) ch));

            for (int c = 0; c < j; c++) {
              s.add(w[c]);
            }
            s.stem();
            System.out.print(s.toString());
          }
          if (ch < 0) {
            break;
          }
          // Echo the non-letter byte (including the one that ended the word above).
          System.out.print((char) ch);
        }
      } catch (IOException e) {
        System.out.println("error reading " + args[i]);
        break;
      } finally {
        // Release the file handle on every path, including the break above.
        try {
          in.close();
        } catch (IOException ignored) {
          // Nothing useful can be done if closing an input stream fails.
        }
      }
    }
  }
Ejemplo n.º 3
0
  /**
   * Tokenizes the document with the given ID and returns its stemmed terms.
   *
   * <p>The document is read from {@code IRTM/<file>.txt} and split on whitespace. Each token is
   * lower-cased and stripped of every character that is not an ASCII letter or digit. Tokens that
   * are empty or appear in {@code stopList} are dropped; the rest are stemmed, and a stem is
   * dropped as well if it is empty or appears in {@code stopList}.
   *
   * <p>If the file does not exist, {@code "File not found."} is printed to standard output and the
   * JVM exits.
   * NOTE(review): the exit status is 0 even though this is a failure - confirm that is intended.
   *
   * @param file the document ID, used to build the file name
   * @return the stemmed, stop-word-filtered terms in document order
   */
  public static ArrayList<String> tokenize(int file) {
    Scanner fileIn = null;
    try {
      fileIn = new Scanner(new FileInputStream("IRTM/" + file + ".txt"));
    } catch (FileNotFoundException e) {
      System.out.println("File not found.");
      System.exit(0);
    }

    ArrayList<String> words = new ArrayList<String>();
    try {
      while (fileIn.hasNext()) {
        // A fixed locale keeps lower-casing independent of the JVM default (e.g. under a Turkish
        // locale 'I' would become a dotless i, which the ASCII-only filter below then deletes).
        String token =
            fileIn
                .next()
                .toLowerCase(java.util.Locale.ENGLISH)
                .replaceAll("[^a-zA-Z0-9]", ""); // drop everything but letters and digits

        // First pass: filter the raw token.
        if (isStopOrEmpty(token)) {
          continue;
        }

        Stemmer s = new Stemmer();
        for (int k = 0; k < token.length(); k++) {
          s.add(token.charAt(k));
        }
        s.stem();
        String stem = s.toString();

        // Second pass: filter again on the stemmed form.
        if (!isStopOrEmpty(stem)) {
          words.add(stem);
        }
      }
    } finally {
      // Close the file even if tokenizing or stemming throws.
      fileIn.close();
    }
    return words;
  }

  /**
   * Returns true if the term is empty or equal to an entry of {@code stopList}. The empty check
   * does not depend on {@code stopList} having any entries.
   */
  private static boolean isStopOrEmpty(String term) {
    if (term.length() == 0) {
      return true;
    }
    for (int i = 0; i < stopList.size(); i++) {
      if (term.equals(stopList.get(i))) {
        return true;
      }
    }
    return false;
  }