public weka.classifiers.Classifier getClassifier() throws Exception {
    // Configure the text-to-feature-vector filter from the runtime parameters.
    StringToWordVector stwv = new StringToWordVector();
    stwv.setTFTransform(hasParam(Constant.RUNTIME_PARAMS.USE_TFIDF));
    stwv.setIDFTransform(hasParam(Constant.RUNTIME_PARAMS.USE_TFIDF));
    stwv.setLowerCaseTokens(hasParam(Constant.RUNTIME_PARAMS.CONV_LOWERCASE));
    stwv.setUseStoplist(hasParam(Constant.RUNTIME_PARAMS.REM_STOP_WORDS));
    stwv.setOutputWordCounts(hasParam(Constant.RUNTIME_PARAMS.USE_WORD_FREQ));

    if (hasParam(Constant.RUNTIME_PARAMS.TRAIN_AND_TEST)) {
        stwv.setInputFormat(getTrainData());
    }

    if (hasParam(Constant.RUNTIME_PARAMS.USE_BIGRAM)) {
        NGramTokenizer tokenizer = new NGramTokenizer();
        tokenizer.setNGramMinSize(2);
        tokenizer.setNGramMaxSize(2); // pin the max size; the default (3) would also emit trigrams
        stwv.setTokenizer(tokenizer);
    } else if (hasParam(Constant.RUNTIME_PARAMS.USE_TRIGRAM)) {
        NGramTokenizer tokenizer = new NGramTokenizer();
        tokenizer.setNGramMinSize(3);
        tokenizer.setNGramMaxSize(3);
        stwv.setTokenizer(tokenizer);
    }

    if (hasParam(Constant.RUNTIME_PARAMS.USE_STEMMER)) {
        SnowballStemmer stemmer = new SnowballStemmer("porter");
        stwv.setStemmer(stemmer);
    }

    // Wrap logistic regression and the filter together so raw string data
    // can be fed to the classifier directly.
    Logistic l = new Logistic();
    FilteredClassifier cls = new FilteredClassifier();
    cls.setClassifier(l);
    cls.setFilter(stwv);
    if (hasParam(Constant.RUNTIME_PARAMS.TRAIN_AND_TEST)) {
        cls.buildClassifier(getTrainData());
    }
    return cls;
}
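// A minimal usage sketch (not from the original class): evaluating the
// classifier returned by getClassifier() on held-out data. getTestData() is a
// hypothetical accessor assumed to mirror getTrainData().
public void evaluateOnTestSet() throws Exception {
    weka.classifiers.Classifier cls = getClassifier();
    Evaluation eval = new Evaluation(getTrainData());
    eval.evaluateModel(cls, getTestData()); // hypothetical test-set accessor
    System.out.println(eval.toSummaryString());
}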
public void filterData() throws Exception {
    Instances data = source.getDataSet();
    StringToWordVector stv = new StringToWordVector();
    stv.setOptions(
        weka.core.Utils.splitOptions(
            "-R first-last -W 1000 "
                + "-prune-rate -1.0 -N 0 "
                + "-stemmer weka.core.stemmers.NullStemmer -M 1 "
                + "-tokenizer \"weka.core.tokenizers.WordTokenizer -delimiters \\\" \\r\\n\\t.,;:\\\'\\\"()?!\""));
    stv.setInputFormat(data);
    Instances newdata = Filter.useFilter(data, stv);
    this.inst = newdata;
    this.inst.setClassIndex(0);
}
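// For readability, a sketch of the same configuration via individual setters
// instead of the option string above (assumed equivalent intent: keep ~1000
// words, minimum term frequency 1, plain word tokenization on the listed
// delimiters; -N 0 and NullStemmer are the defaults and so are omitted).
StringToWordVector stv2 = new StringToWordVector();
stv2.setAttributeIndices("first-last");
stv2.setWordsToKeep(1000);
stv2.setMinTermFreq(1);
WordTokenizer wordTokenizer = new WordTokenizer();
wordTokenizer.setDelimiters(" \r\n\t.,;:'\"()?!");
stv2.setTokenizer(wordTokenizer);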
@Override
public void crossValidation(String traindata) throws Exception {
    DataSource ds = new DataSource(traindata);
    Instances instances = ds.getDataSet();
    StringToWordVector stv = new StringToWordVector();
    stv.setOptions(
        weka.core.Utils.splitOptions(
            "-R first-last -W 1000 "
                + "-prune-rate -1.0 -N 0 "
                + "-stemmer weka.core.stemmers.NullStemmer -M 1 "
                + "-tokenizer \"weka.core.tokenizers.WordTokenizer -delimiters \\\" \\r\\n\\t.,;:\\\'\\\"()?!\""));
    stv.setInputFormat(instances);
    instances = Filter.useFilter(instances, stv);
    instances.setClassIndex(0);
    Evaluation eval = new Evaluation(instances);
    eval.crossValidateModel(this.classifier, instances, 10, new Random(1));
    System.out.println(eval.toSummaryString());
    System.out.println(eval.toMatrixString());
}
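// Caveat worth noting: filtering the full dataset before cross-validation lets
// each test fold's documents influence the shared dictionary and IDF counts.
// A leakage-free sketch wraps filter and classifier together so the filter is
// re-fitted inside every training fold (assumes this.classifier and the raw,
// labeled instances from before filtering, with their class index set):
FilteredClassifier fc = new FilteredClassifier();
fc.setFilter(stv); // same StringToWordVector configuration as above
fc.setClassifier(this.classifier);
Evaluation foldSafeEval = new Evaluation(instances);
foldSafeEval.crossValidateModel(fc, instances, 10, new Random(1));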
/**
 * Builds train and test sets from text directories, then trains and tests a model.
 *
 * @param filePathTrain directory of training documents
 * @param filePathTest directory of test documents
 * @param gram n-gram size for the tokenizer
 */
public static void makeDataSet(String filePathTrain, String filePathTest, int gram) {
    TextDirectoryLoader loader = new TextDirectoryLoader();
    try {
        loader.setDirectory(new File(filePathTrain));
        Instances dataRawTrain = loader.getDataSet();
        loader.setDirectory(new File(filePathTest));
        Instances dataRawTest = loader.getDataSet();

        StringToWordVector filter = new StringToWordVector();
        NGramTokenizer tokeniser = new NGramTokenizer();
        tokeniser.setNGramMinSize(gram);
        tokeniser.setNGramMaxSize(gram);
        filter.setTokenizer(tokeniser);
        filter.setInputFormat(dataRawTrain);
        Instances train = Filter.useFilter(dataRawTrain, filter);
        // filter.setInputFormat(dataRawTest); // intentionally omitted: reusing the
        // initialized filter (batch mode) keeps the test vocabulary aligned with
        // the training dictionary.
        Instances test = Filter.useFilter(dataRawTest, filter);

        // Swap this call to change models.
        trainModelNaiveBayes(train, test);
    } catch (IOException e) {
        e.printStackTrace();
    } catch (Exception e) {
        e.printStackTrace();
    }
}
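// The trainModelNaiveBayes helper is referenced but not shown; a plausible
// minimal sketch (hypothetical -- the original body may differ):
private static void trainModelNaiveBayes(Instances train, Instances test) throws Exception {
    weka.classifiers.bayes.NaiveBayes nb = new weka.classifiers.bayes.NaiveBayes();
    nb.buildClassifier(train);
    Evaluation eval = new Evaluation(train);
    eval.evaluateModel(nb, test);
    System.out.println(eval.toSummaryString());
}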
public static void main(String[] args) {
    if (args.length < 1) {
        System.out.println("usage: C4_5TweetTopicCategorization <root_path>");
        System.exit(-1);
    }

    String rootPath = args[0];
    File dataFolder = new File(rootPath + "/data");
    String resultFolderPath = rootPath + "/results/C4_5/";

    CrisisMailer crisisMailer = CrisisMailer.getCrisisMailer();
    Logger logger = Logger.getLogger(C4_5TweetTopicCategorization.class);
    PropertyConfigurator.configure(Constants.LOG4J_PROPERTIES_FILE_PATH);

    File resultFolder = new File(resultFolderPath);
    if (!resultFolder.exists()) resultFolder.mkdirs(); // mkdirs: the parent 'results' folder may not exist yet

    CSVLoader csvLoader = new CSVLoader();
    try {
        for (File dataSetName : dataFolder.listFiles()) {
            Instances data = null;
            try {
                csvLoader.setSource(dataSetName);
                csvLoader.setStringAttributes("2");
                data = csvLoader.getDataSet();
            } catch (IOException ioe) {
                logger.error(ioe);
                crisisMailer.sendEmailAlert(ioe);
                System.exit(-1);
            }

            data.setClassIndex(data.numAttributes() - 1);
            data.deleteWithMissingClass();

            Instances vectorizedData = null;
            StringToWordVector stringToWordVectorFilter = new StringToWordVector();
            try {
                // Configure the filter before setInputFormat; Weka filters expect
                // their options to be fixed when the input format is declared.
                stringToWordVectorFilter.setAttributeIndices("2");
                stringToWordVectorFilter.setIDFTransform(true);
                stringToWordVectorFilter.setLowerCaseTokens(true);
                stringToWordVectorFilter.setOutputWordCounts(false);
                stringToWordVectorFilter.setUseStoplist(true);
                stringToWordVectorFilter.setInputFormat(data);
                vectorizedData = Filter.useFilter(data, stringToWordVectorFilter);
                vectorizedData.deleteAttributeAt(0);
                // System.out.println(vectorizedData);
            } catch (Exception exception) {
                logger.error(exception);
                crisisMailer.sendEmailAlert(exception);
                System.exit(-1);
            }

            J48 j48Classifier = new J48();
            /*
            FilteredClassifier filteredClassifier = new FilteredClassifier();
            filteredClassifier.setFilter(stringToWordVectorFilter);
            filteredClassifier.setClassifier(j48Classifier);
            */

            try {
                Evaluation eval = new Evaluation(vectorizedData);
                eval.crossValidateModel(
                    j48Classifier, vectorizedData, 5, new Random(System.currentTimeMillis()));

                FileOutputStream resultOutputStream =
                    new FileOutputStream(new File(resultFolderPath + dataSetName.getName()));
                resultOutputStream.write(eval.toSummaryString("=== Summary ===", false).getBytes());
                resultOutputStream.write(eval.toMatrixString().getBytes());
                resultOutputStream.write(eval.toClassDetailsString().getBytes());
                resultOutputStream.close();
            } catch (Exception exception) {
                logger.error(exception);
                crisisMailer.sendEmailAlert(exception);
                System.exit(-1);
            }
        }
    } catch (Exception exception) {
        logger.error(exception);
        crisisMailer.sendEmailAlert(exception);
        System.exit(-1); // was System.out.println(-1): exit with an error status like the other handlers
    }
}
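// A sketch of the alternative hinted at by the commented-out block above:
// cross-validating a FilteredClassifier on the raw (unvectorized) data, so the
// StringToWordVector dictionary is rebuilt from each training fold rather than
// shared across folds. Assumes 'data' still holds the raw instances with the
// class index set.
FilteredClassifier filteredClassifier = new FilteredClassifier();
filteredClassifier.setFilter(stringToWordVectorFilter);
filteredClassifier.setClassifier(new J48());
Evaluation filteredEval = new Evaluation(data);
filteredEval.crossValidateModel(
    filteredClassifier, data, 5, new Random(System.currentTimeMillis()));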
/**
 * Text classification is a special case: when StringToWordVector computes term
 * (attribute) weights it needs corpus-wide statistics such as document frequency
 * (DF), so the data must be processed as one batch. Note that in Weka some
 * learning algorithms are batch-based and some are not.
 */
public void finishBatch() throws Exception {
    filter.setIDFTransform(true);
    filter.setInputFormat(instances);
    // Only now is the data in the attribute-vector format Weka's learners expect.
    Instances filteredData = Filter.useFilter(instances, filter);
    // Train the classifier on the vectorized data.
    classifier.buildClassifier(filteredData);
}
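// Companion sketch (assumes the same 'filter' and 'classifier' fields used in
// finishBatch()): once the batch has been processed, the fitted filter can
// vectorize new instances one at a time against the training dictionary.
public double classify(weka.core.Instance raw) throws Exception {
    filter.input(raw); // mapped using the fitted dictionary and IDF statistics
    weka.core.Instance vectorized = filter.output();
    return classifier.classifyInstance(vectorized);
}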