private Set<String> readDict(String filename) { Set<String> a = Generics.newHashSet(); // System.err.println("XM:::readDict(filename: " + filename + ")"); System.err.println("Loading affix dictionary from " + filename); try { /* if(filename.endsWith("in.as") ||filename.endsWith("in.city") ){ aDetectorReader = new BufferedReader(new InputStreamReader(new FileInputStream(filename), "Big5_HKSCS")); }else{ aDetectorReader = new BufferedReader(new InputStreamReader(new FileInputStream(filename), "GB18030")); } */ InputStream is = IOUtils.getInputStreamFromURLOrClasspathOrFileSystem(filename); BufferedReader aDetectorReader = new BufferedReader(new InputStreamReader(is, "UTF-8")); String aDetectorLine; // System.err.println("DEBUG: in affDict readDict"); while ((aDetectorLine = aDetectorReader.readLine()) != null) { // System.err.println("DEBUG: affDict: "+filename+" "+aDetectorLine); a.add(aDetectorLine); } is.close(); } catch (IOException e) { throw new RuntimeIOException(e); } return a; }
private static Set<String> readDict(String filename, boolean normalize) { Set<String> word = Generics.newHashSet(); logger.info( "Loading " + (normalize ? "normalized" : "unnormalized") + " dictionary from " + filename); try { InputStream is = IOUtils.getInputStreamFromURLOrClasspathOrFileSystem(filename); BufferedReader wordDetectorReader = new BufferedReader(new InputStreamReader(is, "UTF-8")); int i = 0; for (String wordDetectorLine; (wordDetectorLine = wordDetectorReader.readLine()) != null; ) { i++; // String[] fields = wordDetectorLine.split(" "); // logger.debug("DEBUG: "+filename+" "+wordDetectorLine); int origLeng = wordDetectorLine.length(); wordDetectorLine = wordDetectorLine.trim(); int newLeng = wordDetectorLine.length(); if (newLeng != origLeng) { EncodingPrintWriter.err.println( "Line " + i + " of " + filename + " has leading/trailing whitespace: |" + wordDetectorLine + "|", "UTF-8"); } if (newLeng == 0) { EncodingPrintWriter.err.println("Line " + i + " of " + filename + " is empty", "UTF-8"); } else { if (normalize) { wordDetectorLine = ChineseUtils.normalize( wordDetectorLine, ChineseUtils.ASCII, ChineseUtils.ASCII, ChineseUtils.NORMALIZE); } word.add(wordDetectorLine); } } is.close(); } catch (IOException e) { throw new RuntimeIOException(e); } return word; }