/** * Test program for demonstrating the Stemmer. It reads text from a a list of files, stems each * word, and writes the result to standard output. Note that the word stemmed is expected to be in * lower case: forcing lower case must be done outside the Stemmer class. Usage: Stemmer file-name * file-name ... */ public String stemFile(FileInputStream in) { char[] w = new char[501]; String stemmed = new String(); stemmed = ""; Stemmer s = new Stemmer(); // try // { // FileInputStream in = new FileInputStream(filename); try { while (true) { int ch = in.read(); if (Character.isLetter((char) ch)) { int j = 0; while (true) { ch = Character.toLowerCase((char) ch); w[j] = (char) ch; if (j < 500) j++; ch = in.read(); // System.out.println(ch); if (!Character.isLetter((char) ch)) { /* to test add(char ch) */ for (int c = 0; c < j; c++) s.add(w[c]); /* or, to test add(char[] w, int j) */ /* s.add(w, j); */ s.stem(); { String u; /* and now, to test toString() : */ u = s.toString(); stemmed = stemmed + " " + u; /* to test getResultBuffer(), getResultLength() : */ /* u = new String(s.getResultBuffer(), 0, s.getResultLength()); */ // System.out.print(u); } break; } } } if (ch < 0) break; // System.out.print((char)ch); } } catch (IOException e) { System.out.println("error reading "); } // } // catch (FileNotFoundException e) { System.out.println("file not found"); } // catch (Exception e) { System.out.println("file not found"); } return stemmed; }
/**
 * Test program for demonstrating the Stemmer. It reads text from a list of files, stems each
 * word, and writes the result to standard output. Characters that are not part of a word are
 * echoed unchanged. Letters are lower-cased here before being handed to the Stemmer, and only
 * the first 500 letters of a word are passed on.
 *
 * <p>Processing stops at the first file that cannot be opened or read; a message naming that
 * file is printed to standard output. Every file that was opened is closed again before moving
 * on.
 *
 * <p>Usage: Stemmer file-name file-name ...
 *
 * @param args names of the files to stem
 */
public static void main(String[] args) {
  char[] w = new char[501];
  Stemmer s = new Stemmer();
  for (int i = 0; i < args.length; i++) {
    FileInputStream in;
    try {
      in = new FileInputStream(args[i]);
    } catch (FileNotFoundException e) {
      System.out.println("file " + args[i] + " not found");
      break;
    }

    try {
      while (true) {
        int ch = in.read();
        if (Character.isLetter((char) ch)) {
          int j = 0;
          do {
            w[j] = Character.toLowerCase((char) ch);
            // Past 500 letters the index stops advancing, so the extra letters overwrite
            // w[500] and are never passed on.
            if (j < 500) {
              j++;
            }
            ch = in.read();
          } while (Character.isLetter((char) ch));

          for (int c = 0; c < j; c++) {
            s.add(w[c]);
          }
          s.stem();
          System.out.print(s.toString());
        }
        // read() returns a negative value at end of stream; nothing is echoed for it.
        if (ch < 0) {
          break;
        }
        System.out.print((char) ch);
      }
    } catch (IOException e) {
      System.out.println("error reading " + args[i]);
      break;
    } finally {
      // Release the file handle on every path, including the early exits above.
      try {
        in.close();
      } catch (IOException ignored) {
        // Nothing useful can be done if closing a stream that was only read from fails.
      }
    }
  }
}
// 輸入文章的ID回傳一個tokenize過的ArrayList public static ArrayList<String> tokenize(int file) { Scanner fileIn = null; ArrayList<String> words = new ArrayList<String>(); try { fileIn = new Scanner(new FileInputStream("IRTM/" + file + ".txt")); } catch (FileNotFoundException e) { System.out.println("File not found."); System.exit(0); } while (fileIn.hasNext()) { Stemmer s = new Stemmer(); String token = fileIn.next().toLowerCase().replaceAll("[^a-zA-Z0-9]", ""); // 將非字母與數字的char清除 // 濾掉stop word boolean isStop = false; for (int i = 0; i < stopList.size(); i++) { if (token.equals(stopList.get(i)) || token.equals("")) { isStop = true; break; } } if (!isStop) { // 用stemmer將字stem for (int k = 0; k < token.length(); k++) { s.add(token.charAt(k)); } s.stem(); // 在濾一次stop word isStop = false; for (int i = 0; i < stopList.size(); i++) { if (s.toString().equals(stopList.get(i)) || s.toString().equals("")) { isStop = true; break; } } if (!isStop) { words.add(s.toString()); } } } fileIn.close(); // readStop.close(); return words; }