コード例 #1
0
  /**
   * Builds the processed text of this section.
   *
   * <p>The section's HTML is passed, in order, through {@code modifyAPIElementText},
   * {@code Utils.replaceCLTDelimiter}, {@code Utils.addPunctuationHTML} and finally
   * {@code Utils.HTML2Text}; the result of the last step is returned.
   *
   * @return the section text produced by the pipeline above
   */
  public String getProcessedSectionText() {
    return Utils.HTML2Text(
        Utils.addPunctuationHTML(
            Utils.replaceCLTDelimiter(
                modifyAPIElementText(section.getHTML()))));
  }
コード例 #2
0
 /**
  * Reads every entry of {@code directoryPath} as one document of class {@code classID},
  * adds its digit-free tokens to {@code vocabulary}, and counts how often each such token
  * occurs across the class.
  *
  * <p>Side effects: increases {@code numDocs}, sets {@code numDocsOfEachClass[classID]},
  * may append to {@code vocabulary}, and appends this class's term-count map to
  * {@code vocabOfClasses}.
  *
  * @param directoryPath directory whose entries are the documents of one class
  * @param classID index into {@code numDocsOfEachClass} for this class
  * @throws IOException if a document cannot be opened or read
  */
 private void extractVocabulary(String directoryPath, int classID) throws IOException {
   File[] directoryListing = new File(directoryPath).listFiles();
   // listFiles() returns null when the path is not a directory or cannot be listed.
   // Treat that as a class with no documents instead of dereferencing null.
   if (directoryListing == null) {
     directoryListing = new File[0];
   }
   numDocs += directoryListing.length;
   numDocsOfEachClass[classID] = directoryListing.length;

   TreeMap<String, Integer> thisClassVocab = new TreeMap<String, Integer>();
   for (File child : directoryListing) {
     System.out.print(".");

     // Join all lines with a trailing space. The loop stops on the null returned at
     // end of stream, so the literal text "null" is never appended to the document.
     StringBuilder wholeFile = new StringBuilder();
     try (FileInputStream fis = new FileInputStream(child);
         BufferedReader br = new BufferedReader(new InputStreamReader(fis))) {
       String line;
       while ((line = br.readLine()) != null) {
         wholeFile.append(line).append(' ');
       }
     }

     for (String term : Utils.tokenizeString(wholeFile.toString())) {
       // Skip any token that contains a digit.
       if (term.matches(".*\\d+.*")) {
         continue;
       }
       if (!vocabulary.contains(term)) {
         vocabulary.add(term);
       }
       Integer previous = thisClassVocab.get(term);
       thisClassVocab.put(term, previous == null ? 1 : previous + 1);
     }
   }
   vocabOfClasses.add(thisClassVocab);
 }
コード例 #3
0
  /**
   * Classifies the document stored at {@code newDocPath}.
   *
   * <p>The file is read line by line (lines joined with a space), tokenized with
   * {@code Utils.tokenizeString}, and restricted to tokens present in {@code vocabulary}.
   * Each class score starts at {@code log10(prior[c])} and receives
   * {@code log10(condprob[termIndex][c])} once for every distinct in-vocabulary term.
   * The name of the class selected by {@code Utils.argmax} is returned.
   *
   * @param newDocPath path of the document to classify
   * @return the entry of {@code classNames} at the index chosen by {@code Utils.argmax}
   * @throws IOException if the document cannot be opened or read
   */
  public String apply(String newDocPath) throws IOException {
    // Read the whole document. The loop stops on the null returned at end of stream,
    // so the literal text "null" is never appended, and the reader is always closed.
    StringBuilder newDocContent = new StringBuilder();
    try (FileInputStream fis = new FileInputStream(new File(newDocPath));
        BufferedReader br = new BufferedReader(new InputStreamReader(fis))) {
      String line;
      while ((line = br.readLine()) != null) {
        newDocContent.append(line).append(' ');
      }
    }

    List<String> newDocTokens = Utils.tokenizeString(newDocContent.toString());

    // Count occurrences of every token that is part of the known vocabulary.
    TreeMap<String, Integer> termFreq = new TreeMap<String, Integer>();
    for (String str : newDocTokens) {
      if (vocabulary.contains(str)) {
        Integer seen = termFreq.get(str);
        termFreq.put(str, seen == null ? 1 : seen + 1);
      }
    }

    double[] score = new double[numClasses];
    for (int c = 0; c < numClasses; c++) {
      score[c] = Math.log10(prior[c]);
    }
    // NOTE(review): only the distinct terms contribute; the counts stored in termFreq
    // are not used as weights. Confirm whether that is intended.
    for (String t : termFreq.keySet()) {
      // Look the term up once instead of once per class; per-class summation order
      // is unchanged.
      int termIndex = vocabulary.indexOf(t);
      for (int c = 0; c < numClasses; c++) {
        score[c] += Math.log10(condprob[termIndex][c]);
      }
    }

    // return class id
    return classNames[Utils.argmax(score)];
  }
コード例 #4
0
  /**
   * Returns the sentences of this section.
   *
   * <p>The text comes from {@link #getProcessedSectionText()} and is handed to
   * {@code Utils.getSentences} together with the {@code parse} flag.
   *
   * @param parse flag forwarded unchanged to {@code Utils.getSentences}
   * @return the list returned by {@code Utils.getSentences}
   */
  public List<CoreMap> getAllSentences(boolean parse) {
    return Utils.getSentences(getProcessedSectionText(), parse);
  }