/** * Make data sets and train and test model * * @param filePathTrain * @param filePathTest * @param gram */ public static void makeDataSet(String filePathTrain, String filePathTest, int gram) { TextDirectoryLoader loader = new TextDirectoryLoader(); try { loader.setDirectory(new File(filePathTrain)); Instances dataRawTrain = loader.getDataSet(); loader.setDirectory(new File(filePathTest)); Instances dataRawTest = loader.getDataSet(); StringToWordVector filter = new StringToWordVector(); NGramTokenizer tokeniser = new NGramTokenizer(); tokeniser.setNGramMinSize(gram); tokeniser.setNGramMaxSize(gram); filter.setTokenizer(tokeniser); filter.setInputFormat(dataRawTrain); Instances train = Filter.useFilter(dataRawTrain, filter); // filter.setInputFormat(dataRawTest); Instances test = Filter.useFilter(dataRawTest, filter); /** * * * * <p>Replace this function each time to change models */ trainModelNaiveBayes(train, test); } catch (IOException e) { // TODO Auto-generated catch block e.printStackTrace(); } catch (Exception e) { // TODO Auto-generated catch block e.printStackTrace(); } }
/**
 * Parses a given list of options.
 *
 * <p>
 * <!-- options-start -->
 * Valid options are:
 *
 * <pre> -D
 *  Enables debug output.
 *  (default: off)</pre>
 *
 * <pre> -F
 *  Stores the filename in an additional attribute.
 *  (default: off)</pre>
 *
 * <pre> -dir &lt;directory&gt;
 *  The directory to work on.
 *  (default: current directory)</pre>
 *
 * <pre> -charset &lt;charset name&gt;
 *  The character set to use, e.g UTF-8.
 *  (default: use the default character set)</pre>
 *
 * <pre> -R
 *  Retain all string attribute values when reading incrementally.</pre>
 *
 * <!-- options-end -->
 *
 * <p>When {@code -dir} is absent or empty, the directory is set to the JVM's current working
 * directory (the {@code user.dir} system property), as documented above.
 *
 * @param options the options
 * @throws Exception if options cannot be set
 */
public void setOptions(String[] options) throws Exception {
  setDebug(Utils.getFlag("D", options));
  setOutputFilename(Utils.getFlag("F", options));

  // An empty value would otherwise become new File(""), the empty abstract pathname, which is
  // not the working directory; fall back explicitly so the documented default is honoured.
  String dir = Utils.getOption("dir", options);
  if (dir.length() == 0) {
    dir = System.getProperty("user.dir");
  }
  setDirectory(new File(dir));

  // An empty string means "use the default character set".
  String charSet = Utils.getOption("charset", options);
  m_charSet = charSet.length() > 0 ? charSet : "";

  setRetainStringValues(Utils.getFlag('R', options));
}