Exemple #1
0
  /**
   * Reads an affix dictionary, one entry per line, into a set.
   *
   * <p>Every line is added exactly as read: nothing is trimmed or filtered here, so an empty
   * line ends up in the set as the empty string. The resource is decoded as UTF-8.
   *
   * @param filename location handed to
   *     {@code IOUtils.getInputStreamFromURLOrClasspathOrFileSystem}
   * @return the set of distinct lines read from the resource
   * @throws RuntimeIOException if opening, reading, or closing the resource fails
   */
  private Set<String> readDict(String filename) {
    Set<String> entries = Generics.newHashSet();

    System.err.println("Loading affix dictionary from " + filename);

    // try-with-resources closes the reader and the underlying stream on every exit path,
    // including when readLine() throws part-way through the file.
    try (InputStream is = IOUtils.getInputStreamFromURLOrClasspathOrFileSystem(filename);
        BufferedReader reader = new BufferedReader(new InputStreamReader(is, "UTF-8"))) {
      for (String line; (line = reader.readLine()) != null; ) {
        entries.add(line);
      }
    } catch (IOException e) {
      throw new RuntimeIOException(e);
    }
    return entries;
  }
  /**
   * Reads a word dictionary, one entry per line, into a set.
   *
   * <p>Each line is trimmed before use. A line whose length changes when trimmed is reported
   * on {@code EncodingPrintWriter.err}; a line that is empty after trimming is reported and
   * skipped. When {@code normalize} is true, each remaining entry is passed through
   * {@code ChineseUtils.normalize} before being added. The resource is decoded as UTF-8.
   *
   * @param filename location handed to
   *     {@code IOUtils.getInputStreamFromURLOrClasspathOrFileSystem}
   * @param normalize whether to run each entry through {@code ChineseUtils.normalize}
   * @return the set of distinct (optionally normalized) non-empty entries
   * @throws RuntimeIOException if opening, reading, or closing the resource fails
   */
  private static Set<String> readDict(String filename, boolean normalize) {
    Set<String> word = Generics.newHashSet();

    logger.info(
        "Loading " + (normalize ? "normalized" : "unnormalized") + " dictionary from " + filename);

    // try-with-resources closes the reader and the underlying stream on every exit path,
    // including when reading or normalization throws part-way through the file.
    try (InputStream is = IOUtils.getInputStreamFromURLOrClasspathOrFileSystem(filename);
        BufferedReader reader = new BufferedReader(new InputStreamReader(is, "UTF-8"))) {
      int lineNumber = 0; // 1-based, used only in the diagnostics below
      for (String rawLine; (rawLine = reader.readLine()) != null; ) {
        lineNumber++;
        String entry = rawLine.trim();
        if (entry.length() != rawLine.length()) {
          EncodingPrintWriter.err.println(
              "Line "
                  + lineNumber
                  + " of "
                  + filename
                  + " has leading/trailing whitespace: |"
                  + entry
                  + "|",
              "UTF-8");
        }
        if (entry.length() == 0) {
          // Whitespace-only lines are reported twice: once above, once here.
          EncodingPrintWriter.err.println(
              "Line " + lineNumber + " of " + filename + " is empty", "UTF-8");
          continue;
        }
        if (normalize) {
          entry =
              ChineseUtils.normalize(
                  entry, ChineseUtils.ASCII, ChineseUtils.ASCII, ChineseUtils.NORMALIZE);
        }
        word.add(entry);
      }
    } catch (IOException e) {
      throw new RuntimeIOException(e);
    }
    return word;
  }