Ejemplo n.º 1
0
  /**
   * Command-line entry point for exercising the loader.
   *
   * <p>When at least one argument is given, the arguments are passed to {@code setOptions}, and
   * the dataset structure followed by every instance the loader yields is printed to standard
   * output. Any exception raised while doing so is printed as a stack trace. When no arguments
   * are given, a usage message listing the loader's options is written to standard error.
   *
   * @param args loader options; if empty, only the usage message is printed
   */
  public static void main(String[] args) {
    if (args.length == 0) {
      // Nothing to load: describe the available options and stop.
      System.err.println("\nUsage:\n\tTextDirectoryLoader [options]\n\nOptions:\n");

      for (Enumeration<?> options = ((OptionHandler) new TextDirectoryLoader()).listOptions();
          options.hasMoreElements(); ) {
        Option current = (Option) options.nextElement();
        System.err.println(current.synopsis());
        System.err.println(current.description());
      }

      System.err.println();
      return;
    }

    try {
      TextDirectoryLoader textLoader = new TextDirectoryLoader();
      textLoader.setOptions(args);

      Instances header = textLoader.getStructure();
      System.out.println(header);

      // Pull instances one at a time until the loader signals the end with null.
      for (Instance row = textLoader.getNextInstance(header);
          row != null;
          row = textLoader.getNextInstance(header)) {
        System.out.println(row);
      }
    } catch (Exception e) {
      e.printStackTrace();
    }
  }
  /**
   * Builds n-gram word-vector training and test sets from two text directories and passes them
   * to the model trainer.
   *
   * <p>Both directories are read with the same {@code TextDirectoryLoader}. A {@code
   * StringToWordVector} filter, tokenizing with an {@code NGramTokenizer} whose minimum and
   * maximum n-gram size are both {@code gram}, has its input format set from the raw training
   * data and is then applied to the training data and to the test data in turn. Any exception
   * raised along the way is printed as a stack trace and the method returns normally.
   *
   * @param filePathTrain path of the directory holding the training documents
   * @param filePathTest path of the directory holding the test documents
   * @param gram n-gram size used as both the minimum and the maximum
   */
  public static void makeDataSet(String filePathTrain, String filePathTest, int gram) {
    TextDirectoryLoader directoryLoader = new TextDirectoryLoader();
    try {
      directoryLoader.setDirectory(new File(filePathTrain));
      Instances rawTrain = directoryLoader.getDataSet();

      directoryLoader.setDirectory(new File(filePathTest));
      Instances rawTest = directoryLoader.getDataSet();

      NGramTokenizer ngramTokenizer = new NGramTokenizer();
      ngramTokenizer.setNGramMinSize(gram);
      ngramTokenizer.setNGramMaxSize(gram);

      StringToWordVector wordVectorFilter = new StringToWordVector();
      wordVectorFilter.setTokenizer(ngramTokenizer);

      // The input format is set once, from the training data only; the test data is pushed
      // through the same filter afterwards without re-initialising it.
      wordVectorFilter.setInputFormat(rawTrain);
      Instances vectorTrain = Filter.useFilter(rawTrain, wordVectorFilter);
      Instances vectorTest = Filter.useFilter(rawTest, wordVectorFilter);

      // Swap this call to train and test a different model.
      trainModelNaiveBayes(vectorTrain, vectorTest);
    } catch (Exception e) {
      e.printStackTrace();
    }
  }
  /**
   * Loads training data from a single file, or from every file beneath a directory, and calls
   * {@code chooseClassifier()} after each file that is recognised.
   *
   * <p>Directories are walked recursively. Each regular file's name is printed, files whose name
   * starts with {@code "."} are skipped, and the remaining files are dispatched on substrings of
   * their name: {@code .csv}/{@code .xls}, {@code .txt}, {@code .json}, {@code .xrff},
   * {@code .arff} and {@code .mdf}. Files matching none of these are ignored.
   *
   * @param folder a file or directory to process
   * @throws Exception if a directory cannot be listed, or if loading a file or choosing a
   *     classifier fails
   */
  public void processFolder(File folder) throws Exception {
    if (!folder.isDirectory()) {
      // A lone file is handed to the text loader as-is, and an ARFF file doubles as test data.
      processSingleFile(folder, folder, true);
      return;
    }

    File[] entries = folder.listFiles();
    if (entries == null) {
      // listFiles() returns null on an I/O error; report that instead of failing with an NPE.
      throw new java.io.IOException("Unable to list directory " + folder.getAbsolutePath());
    }

    for (File entry : entries) {
      if (entry.isDirectory()) {
        this.processFolder(entry);
      } else {
        // Inside a directory the text loader is pointed at the directory, not at the file.
        processSingleFile(entry, folder, false);
      }
    }
  }

  /**
   * Loads one regular file according to its name and runs the classifier selection on it.
   *
   * @param file the file to load
   * @param textSource the source handed to {@code TextDirectoryLoader} for {@code .txt} files
   * @param arffIsAlsoTestData whether an {@code .arff} file is additionally loaded into
   *     {@code testdata}
   * @throws Exception if loading the file or choosing a classifier fails
   */
  private void processSingleFile(File file, File textSource, boolean arffIsAlsoTestData)
      throws Exception {
    String fileName = file.getName();
    System.out.println(fileName);

    if (fileName.startsWith(".")) {
      return; // hidden file: never processed
    }

    if (fileName.contains(".csv") || fileName.contains(".xls")) {
      CSVLoader loader = new CSVLoader();
      loader.setSource(file.getAbsoluteFile());
      trainOnLoadedData(loader.getDataSet());
    } else if (fileName.contains(".txt")) {
      TextDirectoryLoader loader = new TextDirectoryLoader();
      System.out.println("About to load text file " + fileName);
      System.out.println("Name of path " + file.getAbsolutePath());
      // NOTE(review): TextDirectoryLoader is documented as loading a directory of text files, so
      // a lone .txt file passed as the source is presumably rejected -- confirm intended use.
      loader.setSource(textSource);
      trainOnLoadedData(loader.getDataSet());
    } else if (fileName.contains(".json")) {
      JSONLoader loader = new JSONLoader();
      loader.setSource(file.getAbsoluteFile());
      trainOnLoadedData(loader.getDataSet());
    } else if (fileName.contains(".xrff")) {
      XRFFLoader loader = new XRFFLoader();
      loader.setSource(file.getAbsoluteFile());
      trainOnLoadedData(loader.getDataSet());
    } else if (fileName.contains(".arff")) {
      if (arffIsAlsoTestData) {
        testdata = readArffFile(file);
      }
      trainOnLoadedData(readArffFile(file));
    } else if (fileName.contains(".mdf")) {
      runCustomerQuery();
    }
  }

  /** Parses an ARFF file into an {@code Instances} object and closes the reader afterwards. */
  private Instances readArffFile(File file) throws Exception {
    BufferedReader reader = new BufferedReader(new FileReader(file));
    try {
      return new Instances(reader);
    } finally {
      reader.close();
    }
  }

  /** Stores the data as {@code traindata}, prints its summary and runs classifier selection. */
  private void trainOnLoadedData(Instances data) throws Exception {
    traindata = data;
    System.out.println(traindata.toSummaryString());
    this.chooseClassifier();
  }

  /** Connects to the database, prints a summary of the customers table and runs selection. */
  private void runCustomerQuery() throws Exception {
    DatabaseConnection connection = new DatabaseConnection();
    connection.connectToDatabase();

    InstanceQuery query = new InstanceQuery();
    // NOTE(review): credentials are hard-coded in source; they should come from configuration.
    query.setUsername("lamogha");
    query.setPassword("[email protected]");
    query.setQuery("select * from customers");

    // NOTE(review): the query result is only summarised; traindata is not updated before
    // chooseClassifier() runs -- confirm whether it should be assigned.
    Instances data = query.retrieveInstances();
    System.out.println(data.toSummaryString());
    this.chooseClassifier();
  }