/** * Main method. * * @param args should contain the name of an input file. */ public static void main(String[] args) { if (args.length > 0) { try { TextDirectoryLoader loader = new TextDirectoryLoader(); loader.setOptions(args); // System.out.println(loader.getDataSet()); Instances structure = loader.getStructure(); System.out.println(structure); Instance temp; do { temp = loader.getNextInstance(structure); if (temp != null) { System.out.println(temp); } } while (temp != null); } catch (Exception e) { e.printStackTrace(); } } else { System.err.println("\nUsage:\n" + "\tTextDirectoryLoader [options]\n" + "\n" + "Options:\n"); Enumeration enm = ((OptionHandler) new TextDirectoryLoader()).listOptions(); while (enm.hasMoreElements()) { Option option = (Option) enm.nextElement(); System.err.println(option.synopsis()); System.err.println(option.description()); } System.err.println(); } }
/** * Make data sets and train and test model * * @param filePathTrain * @param filePathTest * @param gram */ public static void makeDataSet(String filePathTrain, String filePathTest, int gram) { TextDirectoryLoader loader = new TextDirectoryLoader(); try { loader.setDirectory(new File(filePathTrain)); Instances dataRawTrain = loader.getDataSet(); loader.setDirectory(new File(filePathTest)); Instances dataRawTest = loader.getDataSet(); StringToWordVector filter = new StringToWordVector(); NGramTokenizer tokeniser = new NGramTokenizer(); tokeniser.setNGramMinSize(gram); tokeniser.setNGramMaxSize(gram); filter.setTokenizer(tokeniser); filter.setInputFormat(dataRawTrain); Instances train = Filter.useFilter(dataRawTrain, filter); // filter.setInputFormat(dataRawTest); Instances test = Filter.useFilter(dataRawTest, filter); /** * * * * <p>Replace this function each time to change models */ trainModelNaiveBayes(train, test); } catch (IOException e) { // TODO Auto-generated catch block e.printStackTrace(); } catch (Exception e) { // TODO Auto-generated catch block e.printStackTrace(); } }
public void processFolder(File folder) throws Exception { if (!folder.isDirectory()) { // manipulate file here String fileName = folder.getName(); System.out.println(fileName); // String extension = getFileExtension(fileName); testdata = new Instances(new BufferedReader(new FileReader(folder))); if (!fileName.startsWith(".") && (fileName.contains(".csv") || fileName.contains(".xls"))) { CSVLoader loader = new CSVLoader(); loader.setSource(new File(folder.getAbsolutePath())); traindata = loader.getDataSet(); System.out.println(traindata.toSummaryString()); this.chooseClassifier(); } else if (!fileName.startsWith(".") && fileName.contains(".txt")) { TextDirectoryLoader loader = new TextDirectoryLoader(); System.out.println("About to load text file " + fileName); System.out.println("Name of path " + folder.getAbsolutePath()); loader.setSource(folder); traindata = loader.getDataSet(); System.out.println(traindata.toSummaryString()); this.chooseClassifier(); } else if (!fileName.startsWith(".") && fileName.contains(".json")) { JSONLoader loader = new JSONLoader(); loader.setSource(new File(folder.getAbsolutePath())); traindata = loader.getDataSet(); System.out.println(traindata.toSummaryString()); this.chooseClassifier(); } else if (!fileName.startsWith(".") && fileName.contains(".xrff")) { XRFFLoader loader = new XRFFLoader(); loader.setSource(new File(folder.getAbsolutePath())); traindata = loader.getDataSet(); System.out.println(traindata.toSummaryString()); this.chooseClassifier(); } else if (!fileName.startsWith(".") && fileName.contains(".arff")) { traindata = new Instances(new BufferedReader(new FileReader(folder.getAbsolutePath()))); testdata = new Instances(new BufferedReader(new FileReader(folder))); System.out.println(traindata.toSummaryString()); this.chooseClassifier(); } else if (!fileName.startsWith(".") && fileName.contains(".mdf")) { DatabaseConnection loader = new DatabaseConnection(); loader.connectToDatabase(); InstanceQuery query = new InstanceQuery(); query.setUsername("lamogha"); query.setPassword("l@mmyPHD"); query.setQuery("select * from customers"); // You can declare that your data set is sparse // query.setSparseData(true); Instances data = query.retrieveInstances(); System.out.println(data.toSummaryString()); this.chooseClassifier(); } } else { for (final File fileEntry : folder.listFiles()) { if (fileEntry.isDirectory()) { this.processFolder(fileEntry); } else { // manipulate file here String fileName = fileEntry.getName(); System.out.println(fileName); if (!fileName.startsWith(".") && (fileName.contains(".csv") || fileName.contains(".xls"))) { CSVLoader loader = new CSVLoader(); loader.setSource(new File(fileEntry.getAbsolutePath())); traindata = loader.getDataSet(); System.out.println(traindata.toSummaryString()); this.chooseClassifier(); } else if (!fileName.startsWith(".") && fileName.contains(".txt")) { TextDirectoryLoader loader = new TextDirectoryLoader(); System.out.println("About to load text file " + fileName); System.out.println("Name of path " + fileEntry.getAbsolutePath()); loader.setSource(folder); traindata = loader.getDataSet(); System.out.println(traindata.toSummaryString()); this.chooseClassifier(); } else if (!fileName.startsWith(".") && fileName.contains(".json")) { JSONLoader loader = new JSONLoader(); loader.setSource(new File(fileEntry.getAbsolutePath())); traindata = loader.getDataSet(); System.out.println(traindata.toSummaryString()); this.chooseClassifier(); } else if (!fileName.startsWith(".") && fileName.contains(".xrff")) { XRFFLoader loader = new XRFFLoader(); loader.setSource(new File(fileEntry.getAbsolutePath())); traindata = loader.getDataSet(); System.out.println(traindata.toSummaryString()); this.chooseClassifier(); } else if (!fileName.startsWith(".")) { traindata = new Instances(new BufferedReader(new FileReader(fileEntry.getAbsolutePath()))); System.out.println(traindata.toSummaryString()); this.chooseClassifier(); } else if (!fileName.startsWith(".") && fileName.contains(".mdf")) { DatabaseConnection loader = new DatabaseConnection(); loader.connectToDatabase(); InstanceQuery query = new InstanceQuery(); query.setUsername("lamogha"); query.setPassword("l@mmyPHD"); query.setQuery("select * from customers"); // You can declare that your data set is sparse // query.setSparseData(true); Instances data = query.retrieveInstances(); System.out.println(data.toSummaryString()); this.chooseClassifier(); } } } // System.exit(0); } }