/**
 * Reads a file, or every entry directly inside a directory, line by line and
 * reports each line to the given listener.
 *
 * <p>For each file, {@code l.onFileBegin(name)} is called with the file's simple
 * name; when it returns {@code false} the file is skipped. Otherwise every line
 * is passed to {@code l.onWord(...)} and, after the last line,
 * {@code l.onFileEnd(name)} is called.
 *
 * <p>The directory is not traversed recursively, and the file content is decoded
 * with the platform default charset (no charset is passed to the reader).
 *
 * @param fileOrDirectory path of a single file or of a directory
 * @param l listener receiving the file and word callbacks
 * @throws IOException if the directory cannot be listed, or a file cannot be
 *         opened or read
 */
public static void readWords(String fileOrDirectory, ReadListener l) throws IOException {
    File file = new File(fileOrDirectory);
    File[] files = new File[] {file};
    if (file.isDirectory()) {
        files = file.listFiles();
        // listFiles() returns null on an I/O error or missing permission;
        // report that as an IOException rather than failing with an NPE below.
        if (files == null) {
            throw new IOException("cannot list directory \"" + fileOrDirectory + "\"");
        }
    }
    for (int i = 0; i < files.length; i++) {
        String name = files[i].getName();
        if (!l.onFileBegin(name)) {
            continue;
        }
        BufferedReader in = new BufferedReader(new InputStreamReader(new FileInputStream(files[i])));
        try {
            String word;
            while ((word = in.readLine()) != null) {
                l.onWord(word);
            }
            l.onFileEnd(name);
        } finally {
            // Always release the file handle, even when reading or a listener
            // callback throws.
            in.close();
        }
    }
}
public static void readWords(String fileOrDirectory, ReadListener l, String charsetName) throws IOException { File file; if (fileOrDirectory.startsWith("classpath:")) { String name = fileOrDirectory.substring("classpath:".length()); URL url = FileWordsReader.class.getClassLoader().getResource(name); if (url == null) { throw new FileNotFoundException("file \"" + name + "\" not found in classpath!"); } file = new File(getUrlPath(url)); } else { file = new File(fileOrDirectory); if (!file.exists()) { throw new FileNotFoundException("file \"" + fileOrDirectory + "\" not found!"); } } ArrayList /*<File>*/ dirs = new ArrayList /*<File>*/(); LinkedList /*<File>*/ dics = new LinkedList /*<File>*/(); String dir; if (file.isDirectory()) { dirs.add(file); dir = file.getAbsolutePath(); } else { dics.add(file); dir = file.getParentFile().getAbsolutePath(); } int index = 0; while (index < dirs.size()) { File cur = (File) dirs.get(index++); File[] files = cur.listFiles(); for (int i = 0; i < files.length; i++) { File f = files[i]; if (f.isDirectory()) { dirs.add(f); } else { dics.add(f); } } } for (Iterator iter = dics.iterator(); iter.hasNext(); ) { File f = (File) iter.next(); String name = f.getAbsolutePath().substring(dir.length() + 1); name = name.replace('\\', '/'); if (!l.onFileBegin(name)) { continue; } BufferedReader in = new BufferedReader(new InputStreamReader(new FileInputStream(f), charsetName)); String word; boolean firstInDic = true; while ((word = in.readLine()) != null) { if (firstInDic) { firstInDic = false; // ref:http://www.w3.org/International/questions/qa-utf8-bom // ZERO WIDTH NO-BREAK SPACE // notepad将文件保存为unitcode或utf-8时会在文件开头保存bom字符串 // notepad根据是否有bom来识别该文件是否是utf-8编码存储的。 // 庖丁字典需要将这个字符从词典中去掉 if (word.length() > 0 && CharSet.isBom(word.charAt(0))) { word = word.substring(1); } } l.onWord(word); } l.onFileEnd(name); in.close(); } }