コード例 #1
0
  /**
   * Tokenizes the input, part-of-speech tags the tokens with the OpenNLP maxent model and returns
   * every token lower-cased, stemmed and paired with its tag.
   *
   * <p>Each result entry has the form {@code stem/TAG}. The tag {@code NNS} is reported as
   * {@code NN}; every other tag is passed through unchanged.
   *
   * @param inputFile value handed unchanged to {@code tokenizer(String)} to obtain the tokens
   * @return one {@code stem/TAG} entry per token, in token order
   * @throws IOException if the POS model file cannot be opened or read
   */
  public static List<String> taggerAndStemmer(String inputFile) throws IOException {
    String[] tokens = tokenizer(inputFile);
    List<String> tokenTags = new ArrayList<String>();
    PlingStemmer stemmer = new PlingStemmer();

    // Only the POS model is needed here. A forward slash is used because it is accepted as a path
    // separator on Windows as well as on Unix-like systems, unlike a backslash.
    // Failures are deliberately not caught: the method declares IOException, and silently
    // returning an empty list would be indistinguishable from "the input had no tokens".
    try (InputStream posModelStream = new FileInputStream("openNLPmodels/en-pos-maxent.bin")) {
      POSTaggerME tagger = new POSTaggerME(new POSModel(posModelStream));
      String[] tags = tagger.tag(tokens);

      for (int i = 0; i < tags.length; i++) {
        String stem = stemmer.stem(tokens[i].toLowerCase());
        // Collapse the NNS tag into NN so both forms share one tag in the output.
        String tag = tags[i].equals("NNS") ? "NN" : tags[i];
        tokenTags.add(stem + "/" + tag);
      }
    }
    return tokenTags;
  }
コード例 #2
0
  /**
   * Tags a sentence and prints one line per word in the form {@code word => TAG @ probability}.
   *
   * @param sentence text to tag; it is split into words on runs of whitespace
   */
  public void run(String sentence) {
    POSTaggerME posTagger = new POSTaggerME(getModel());
    String[] tokens = sentence.split("\\s+");
    String[] labels = posTagger.tag(tokens);
    // probs() is read right after tag(), matching the order of the tagging call above.
    double[] confidences = posTagger.probs();

    int index = 0;
    for (String label : labels) {
      System.out.println(tokens[index] + " => " + label + " @ " + confidences[index]);
      index++;
    }
  }
コード例 #3
0
 /**
  * Tags every token of the given sentence and stores the resulting part of speech on the token
  * under {@code Types.PART_OF_SPEECH}.
  *
  * @param sentence annotation whose tokens are tagged; the tagger is loaded for its language
  */
 @Override
 public void annotate(Annotation sentence) {
   POSTaggerME tagger = new POSTaggerME(loadPOSTagger(sentence.getLanguage()));

   // The tagger works on plain strings, so each token is reduced to its string form first.
   String[] words =
       sentence.tokens().stream().map(token -> token.toString()).toArray(String[]::new);
   String[] labels = tagger.tag(words);

   for (int index = 0; index < words.length; index++) {
     sentence.tokenAt(index).put(Types.PART_OF_SPEECH, POS.fromString(labels[index]));
   }
 }
コード例 #4
0
ファイル: OpenNLPUtil.java プロジェクト: NamyounKim/TextPrism
  /**
   * Tokenizes and tags one column of a document row and collects the accepted keywords.
   *
   * <p>The text of column {@code targetCol} (1-based) is lower-cased, tokenized and tagged. Every
   * token whose tag passes {@code AnalysisThread.roleFilter} is added to the returned matrix via
   * {@code AnalysisThread.addToMatrix}.
   *
   * @param docRow the document row; also forwarded to {@code addToMatrix}
   * @param idCol not used by this implementation
   * @param targetCol 1-based index of the column in {@code docRow} holding the text to analyse
   * @param dateColsNum not used by this implementation
   * @param otherColsNum not used by this implementation
   * @return a new matrix filled with the accepted keywords
   */
  @Override
  public Matrix parseContent(
      LinkedList docRow,
      int idCol,
      int targetCol,
      LinkedList<Integer> dateColsNum,
      LinkedList<Integer> otherColsNum) {

    Matrix result = new Matrix();

    String text = docRow.get(targetCol - 1).toString().toLowerCase();
    String[] words = tokenizer.tokenize(text);
    String[] roles = tagger.tag(words);

    int position = 0;
    for (String role : roles) {
      String word = words[position++];
      // Only keywords whose role passes the filter are recorded.
      if (AnalysisThread.roleFilter(role)) {
        AnalysisThread.addToMatrix(role, word, docRow, result);
      }
    }

    return result;
  }
コード例 #5
0
ファイル: PrecisionRun.java プロジェクト: daalft/PaliTagger
  /**
   * Evaluates a POS model against an annotated file and prints the totals.
   *
   * <p>Every line is processed twice: once with the {@code _TAG} suffixes (underscore followed by
   * upper-case letters) removed to obtain the words, and once with everything up to each
   * underscore removed to obtain the expected tags. The words are tagged with {@code model} and
   * the predicted tags are compared position by position with the expected ones. Mismatches are
   * reported on {@code System.err}; the summary goes to {@code System.out}.
   *
   * @param file path of the annotated file to evaluate against
   * @param model model used to build the tagger under test
   * @throws Exception if the file cannot be read, or if a line yields a different number of
   *     expected and predicted tags
   */
  public void run(String file, POSModel model) throws Exception {
    POSTaggerME tagger = new POSTaggerME(model);
    int correct = 0;
    int wrong = 0;

    // try-with-resources closes the reader on the error paths too (I/O failure or the
    // size-mismatch exception below), which the previous explicit close() did not cover.
    try (BufferedReader br = new BufferedReader(new FileReader(new File(file)))) {
      String line;
      while ((line = br.readLine()) != null) {
        String strip = line.replaceAll("_[A-Z]+", "");
        String[] expectedTags = line.replaceAll("[^_\\s]+?_", "").split(" ");
        // Split once and reuse the words both for tagging and for error reporting.
        String[] words = strip.split(" ");
        String[] predictedTags = tagger.tag(words);

        if (predictedTags.length != expectedTags.length) {
          // something went wrong
          throw new Exception("Tag lists unequal size");
        }
        for (int i = 0; i < predictedTags.length; i++) {
          if (expectedTags[i].equals(predictedTags[i])) {
            correct++;
          } else {
            System.err.println(strip);
            System.err.print(words[i] + " ");
            System.err.println(expectedTags[i] + " tagged as " + predictedTags[i]);
            wrong++;
          }
        }
      }
    }

    System.out.println("Total: " + (correct + wrong));
    System.out.println("Correct: " + correct);
    System.out.println("Wrong: " + wrong);
    // Floating-point division: an empty file prints NaN rather than throwing.
    System.out.println("Precision: " + (((double) correct) / (correct + wrong)));
  }
コード例 #6
0
  /**
   * Builds a naive predicate string of the form {@code verb(first,last)} from a sentence.
   *
   * <p>The sentence is tokenized and POS-tagged as a whole. The result is assembled from:
   *
   * <ul>
   *   <li>the <em>last</em> token tagged {@code VB}, {@code VBD}, {@code VBN} or {@code VBZ};
   *   <li>the <em>first</em> token tagged {@code NN}, {@code NNS}, {@code NNP}, {@code NNPS} or
   *       {@code FW};
   *   <li>the <em>last</em> token tagged with one of those noun tags or {@code JJ}.
   * </ul>
   *
   * <p>A part that is not found is rendered as the text {@code null}, e.g. {@code null(null,null)}.
   *
   * @param s the sentence to analyse
   * @return the predicate string {@code verb(first,last)}
   * @throws IllegalStateException if one of the model files cannot be loaded
   */
  public static String getPredicates(String s) {
    POSModel posModel = readPosModel();
    TokenizerModel tokenizerModel = readTokenizerModel();

    Tokenizer tokenizer = new TokenizerME(tokenizerModel);
    String[] tokens = tokenizer.tokenize(s);

    // Tag the whole token sequence in one call so the tagger sees sentence context, and so the
    // tags come back as an array instead of a "word/TAG" string that must be re-parsed (which
    // picked the wrong field for tokens that themselves contain '/').
    POSTaggerME posTagger = new POSTaggerME(posModel);
    String[] tags = posTagger.tag(tokens);

    String firstNoun = null;
    String lastVerb = null;
    String lastNounOrAdjective = null;
    for (int i = 0; i < tokens.length; i++) {
      String tag = tags[i];
      boolean nounLike = isNounTag(tag);
      if (nounLike && firstNoun == null) {
        firstNoun = tokens[i];
      }
      if (isVerbTag(tag)) {
        lastVerb = tokens[i];
      }
      if (nounLike || "JJ".equals(tag)) {
        lastNounOrAdjective = tokens[i];
      }
    }

    return lastVerb + "(" + firstNoun + "," + lastNounOrAdjective + ")";
  }

  /** Loads the POS tagger model, failing fast with the underlying cause if it is unavailable. */
  private static POSModel readPosModel() {
    // NOTE(review): absolute, machine-specific path kept from the original code.
    try (InputStream in =
        new FileInputStream(
            "/home/opnchaudhary/androidapps/openNlpWeb/WebContent/mod/en-pos-maxent.bin")) {
      return new POSModel(in);
    } catch (java.io.IOException e) {
      throw new IllegalStateException("Unable to load the POS tagger model", e);
    }
  }

  /** Loads the tokenizer model, failing fast with the underlying cause if it is unavailable. */
  private static TokenizerModel readTokenizerModel() {
    // NOTE(review): absolute, machine-specific path kept from the original code.
    try (InputStream in =
        new FileInputStream(
            "/home/opnchaudhary/androidapps/openNlpWeb/WebContent/mod/en-token.bin")) {
      return new TokenizerModel(in);
    } catch (java.io.IOException e) {
      throw new IllegalStateException("Unable to load the tokenizer model", e);
    }
  }

  /** Returns whether the tag is one of the noun tags or the foreign-word tag. */
  private static boolean isNounTag(String tag) {
    return "NN".equals(tag)
        || "NNS".equals(tag)
        || "NNP".equals(tag)
        || "NNPS".equals(tag)
        || "FW".equals(tag);
  }

  /** Returns whether the tag is one of the verb tags considered for the predicate name. */
  private static boolean isVerbTag(String tag) {
    return "VB".equals(tag) || "VBD".equals(tag) || "VBN".equals(tag) || "VBZ".equals(tag);
  }
コード例 #7
0
  /**
   * Trains a POS tagger model from the sample stream and writes it to the configured model file.
   *
   * <p>Steps, in order: load (or default) the training parameters, check the output file,
   * optionally build an n-gram dictionary, create the tagger factory, optionally load and/or
   * extend the tag dictionary, train, and finally write the model. The sample stream is reset
   * after each pass that consumes it and is closed once training has finished or failed.
   *
   * @param format passed through to {@code super.run}
   * @param args passed through to {@code super.run}
   * @throws TerminateToolException if the training parameters are invalid or an I/O or format
   *     error occurs in any step
   * @throws IllegalArgumentException if a tag dictionary cutoff is given but the tag dictionary
   *     is not a {@code MutableTagDictionary}
   */
  public void run(String format, String[] args) {
    super.run(format, args);

    mlParams = CmdLineUtil.loadTrainingParameters(params.getParams(), true);
    if (mlParams == null) {
      // Nothing was loaded: use the defaults with the algorithm derived from the type parameter.
      mlParams = ModelUtil.createDefaultTrainingParameters();
      mlParams.put(TrainingParameters.ALGORITHM_PARAM, getModelType(params.getType()).toString());
    } else if (!TrainerFactory.isValid(mlParams.getSettings())) {
      throw new TerminateToolException(
          1, "Training parameters file '" + params.getParams() + "' is invalid!");
    }

    File outputModel = params.getModel();
    CmdLineUtil.checkOutputFile("pos tagger model", outputModel);

    // Optional n-gram dictionary, built only when a cutoff was supplied.
    Dictionary ngramDictionary = null;
    Integer cutoff = params.getNgram();
    if (cutoff != null) {
      System.err.print("Building ngram dictionary ... ");
      try {
        ngramDictionary = POSTaggerME.buildNGramDictionary(sampleStream, cutoff);
        sampleStream.reset();
      } catch (IOException e) {
        throw new TerminateToolException(
            -1, "IO error while building NGram Dictionary: " + e.getMessage(), e);
      }
      System.err.println("done");
    }

    POSTaggerFactory factory;
    try {
      factory = POSTaggerFactory.create(params.getFactory(), ngramDictionary, null);
    } catch (InvalidFormatException e) {
      throw new TerminateToolException(-1, e.getMessage(), e);
    }

    // Optional tag dictionary loaded from the configured dictionary parameter.
    if (params.getDict() != null) {
      try {
        factory.setTagDictionary(factory.createTagDictionary(params.getDict()));
      } catch (IOException e) {
        throw new TerminateToolException(
            -1, "IO error while loading POS Dictionary: " + e.getMessage(), e);
      }
    }

    // Optional extension of the tag dictionary from the training samples.
    if (params.getTagDictCutoff() != null) {
      try {
        TagDictionary tagDictionary = factory.getTagDictionary();
        if (tagDictionary == null) {
          tagDictionary = factory.createEmptyTagDictionary();
          factory.setTagDictionary(tagDictionary);
        }
        if (!(tagDictionary instanceof MutableTagDictionary)) {
          throw new IllegalArgumentException(
              "Can't extend a POSDictionary that does not implement MutableTagDictionary.");
        }
        POSTaggerME.populatePOSDictionary(
            sampleStream, (MutableTagDictionary) tagDictionary, params.getTagDictCutoff());
        sampleStream.reset();
      } catch (IOException e) {
        throw new TerminateToolException(
            -1, "IO error while creating/extending POS Dictionary: " + e.getMessage(), e);
      }
    }

    POSModel trainedModel;
    try {
      trainedModel =
          opennlp.tools.postag.POSTaggerME.train(
              params.getLang(), sampleStream, mlParams, factory);
    } catch (IOException e) {
      throw new TerminateToolException(
          -1, "IO error while reading training data or indexing data: " + e.getMessage(), e);
    } finally {
      try {
        sampleStream.close();
      } catch (IOException ignored) {
        // sorry that this can fail
      }
    }

    CmdLineUtil.writeModel("pos tagger", outputModel, trainedModel);
  }