/**
 * Reads every file in {@code directoryPath} as a training document of class
 * {@code classID} and records the terms it contains.
 *
 * <p>Side effects on the enclosing object: adds the number of files to
 * {@code numDocs}, stores it in {@code numDocsOfEachClass[classID]}, appends
 * previously unseen terms to {@code vocabulary}, and appends this class's
 * term-to-occurrence-count map to {@code vocabOfClasses}. One "." is printed to
 * standard output per file as a progress indicator.
 *
 * <p>Tokens containing at least one digit are ignored.
 *
 * @param directoryPath directory holding the training documents of one class
 * @param classID       index of the class in {@code numDocsOfEachClass}
 * @throws IOException if the directory cannot be listed or a file cannot be read
 */
private void extractVocabulary(String directoryPath, int classID) throws IOException {
    File dir = new File(directoryPath);
    File[] directoryListing = dir.listFiles();
    // listFiles() returns null when the path is not a directory or cannot be read;
    // fail with a clear message instead of a NullPointerException further down.
    if (directoryListing == null) {
        throw new IOException("Cannot list training directory: " + directoryPath);
    }

    numDocs += directoryListing.length;
    numDocsOfEachClass[classID] = directoryListing.length;

    TreeMap<String, Integer> thisClassVocab = new TreeMap<String, Integer>();
    for (File child : directoryListing) {
        System.out.print(".");

        // Join all lines with single spaces. The loop stops at end of file so the
        // null returned by readLine() is never appended as a literal "null" token.
        StringBuilder wholeFile = new StringBuilder();
        BufferedReader br = new BufferedReader(new InputStreamReader(new FileInputStream(child)));
        try {
            String line;
            while ((line = br.readLine()) != null) {
                wholeFile.append(line).append(' ');
            }
        } finally {
            // Close even when reading fails so the file handle is not leaked.
            br.close();
        }

        for (String term : Utils.tokenizeString(wholeFile.toString())) {
            // Skip any token that contains a digit.
            if (term.matches(".*\\d+.*")) {
                continue;
            }
            if (!vocabulary.contains(term)) {
                vocabulary.add(term);
            }
            Integer count = thisClassVocab.get(term);
            thisClassVocab.put(term, count == null ? 1 : count + 1);
        }
    }
    vocabOfClasses.add(thisClassVocab);
}
public String apply(String newDocPath) throws IOException { FileInputStream fis = new FileInputStream(new File(newDocPath)); BufferedReader br = new BufferedReader(new InputStreamReader(fis)); String newDocContent = ""; String line = ""; while (line != null) { line = br.readLine(); newDocContent += line + " "; } double[] score = new double[numClasses]; List<String> newDocTokens = Utils.tokenizeString(newDocContent); TreeMap<String, Integer> termFreq = new TreeMap<String, Integer>(); for (String str : newDocTokens) { if (vocabulary.contains(str)) { if (!termFreq.containsKey(str)) termFreq.put(str, 1); else { int temp = termFreq.get(str) + 1; termFreq.put(str, temp); } } } br.close(); for (int c = 0; c < numClasses; c++) { score[c] = Math.log10(prior[c]); for (String t : termFreq.keySet()) { score[c] += Math.log10(condprob[vocabulary.indexOf(t)][c]); } } // return class id return classNames[Utils.argmax(score)]; }