/**
 * Builds a {@code Logistic} classifier wrapped in a {@code FilteredClassifier} whose filter is a
 * {@code StringToWordVector} configured from the runtime parameters.
 *
 * <p>The word-vector options (TF/IDF transforms, lower-casing, stop list, word counts, n-gram
 * tokenizer and stemmer) are each switched by the corresponding {@code hasParam} flag. When
 * {@code TRAIN_AND_TEST} is set the classifier is also built on {@code getTrainData()};
 * otherwise the configured but unbuilt classifier is returned.
 *
 * @return the filtered classifier, built only when {@code TRAIN_AND_TEST} is set
 * @throws Exception if the filter rejects the input format or building the classifier fails
 */
public weka.classifiers.Classifier getClassifier() throws Exception {
    StringToWordVector stwv = new StringToWordVector();

    // A single flag drives both halves of the TF-IDF weighting.
    boolean useTfIdf = hasParam(Constant.RUNTIME_PARAMS.USE_TFIDF);
    stwv.setTFTransform(useTfIdf);
    stwv.setIDFTransform(useTfIdf);
    stwv.setLowerCaseTokens(hasParam(Constant.RUNTIME_PARAMS.CONV_LOWERCASE));
    stwv.setUseStoplist(hasParam(Constant.RUNTIME_PARAMS.REM_STOP_WORDS));
    stwv.setOutputWordCounts(hasParam(Constant.RUNTIME_PARAMS.USE_WORD_FREQ));

    // Bigram takes precedence over trigram when both flags are present.
    int nGramSize = 0;
    if (hasParam(Constant.RUNTIME_PARAMS.USE_BIGRAM)) {
        nGramSize = 2;
    } else if (hasParam(Constant.RUNTIME_PARAMS.USE_TRIGRAM)) {
        nGramSize = 3;
    }
    if (nGramSize > 0) {
        NGramTokenizer tokenizer = new NGramTokenizer();
        // Pin both bounds: setting only the minimum leaves the tokenizer's default maximum in
        // place, which made "bigram" mode emit longer n-grams as well.
        tokenizer.setNGramMinSize(nGramSize);
        tokenizer.setNGramMaxSize(nGramSize);
        stwv.setTokenizer(tokenizer);
    }

    if (hasParam(Constant.RUNTIME_PARAMS.USE_STEMMER)) {
        SnowballStemmer stemmer = new SnowballStemmer("porter");
        stwv.setStemmer(stemmer);
    }

    boolean trainNow = hasParam(Constant.RUNTIME_PARAMS.TRAIN_AND_TEST);
    if (trainNow) {
        // setInputFormat must come after every filter option (tokenizer, stemmer, ...) is set.
        stwv.setInputFormat(getTrainData());
    }

    Logistic l = new Logistic();
    FilteredClassifier cls = new FilteredClassifier();
    cls.setClassifier(l);
    cls.setFilter(stwv);

    if (trainNow) {
        cls.buildClassifier(getTrainData());
    }
    return cls;
}
/** * Make data sets and train and test model * * @param filePathTrain * @param filePathTest * @param gram */ public static void makeDataSet(String filePathTrain, String filePathTest, int gram) { TextDirectoryLoader loader = new TextDirectoryLoader(); try { loader.setDirectory(new File(filePathTrain)); Instances dataRawTrain = loader.getDataSet(); loader.setDirectory(new File(filePathTest)); Instances dataRawTest = loader.getDataSet(); StringToWordVector filter = new StringToWordVector(); NGramTokenizer tokeniser = new NGramTokenizer(); tokeniser.setNGramMinSize(gram); tokeniser.setNGramMaxSize(gram); filter.setTokenizer(tokeniser); filter.setInputFormat(dataRawTrain); Instances train = Filter.useFilter(dataRawTrain, filter); // filter.setInputFormat(dataRawTest); Instances test = Filter.useFilter(dataRawTest, filter); /** * * * * <p>Replace this function each time to change models */ trainModelNaiveBayes(train, test); } catch (IOException e) { // TODO Auto-generated catch block e.printStackTrace(); } catch (Exception e) { // TODO Auto-generated catch block e.printStackTrace(); } }