  /** Returns a default test sentence (transliterated Hebrew tokens) as a word list. */
  public List<? extends HasWord> defaultTestSentence() {
    String[] sent = {
      "H", "MWX", "MTPLC", "LA", "RQ", "M", "H", "TWPEH", "H", "MBIFH", "ALA", "GM", "M", "DRKI",
      "H", "HERMH", "yyDOT"
    };
    return Sentence.toWordList(sent);
  }
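
The method above builds a fixed test sentence for a parser's language pack. A caller would typically hand the returned word list straight to a parser. The sketch below is a minimal, hypothetical usage example; the class name and model path are placeholders, not part of the original source.

import edu.stanford.nlp.ling.HasWord;
import edu.stanford.nlp.ling.Sentence;
import edu.stanford.nlp.parser.lexparser.LexicalizedParser;
import edu.stanford.nlp.trees.Tree;
import java.util.List;

public class DefaultSentenceDemo {
  public static void main(String[] args) {
    // Placeholder model path: substitute a real serialized grammar.
    LexicalizedParser lp = LexicalizedParser.loadModel("/path/to/model.ser.gz");
    // Build a word list the same way defaultTestSentence() does.
    List<HasWord> sent = Sentence.toWordList("H", "MWX", "MTPLC", "LA", "RQ");
    Tree parse = lp.apply(sent);  // parse the word list
    parse.pennPrint();            // print the tree in Penn Treebank bracket format
  }
}
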
Example #2
  public static void main(String[] args) throws Exception {
    if (args.length != 2) {
      System.err.println("usage: java TaggerDemo2 modelFile fileToTag");
      return;
    }
    // Load the tagger from the serialized model file given on the command line.
    MaxentTagger tagger = new MaxentTagger(args[0]);
    // Tokenizer factory that keeps untokenizable characters instead of deleting them.
    TokenizerFactory<CoreLabel> ptbTokenizerFactory =
        PTBTokenizer.factory(new CoreLabelTokenFactory(), "untokenizable=noneKeep");
    BufferedReader r =
        new BufferedReader(new InputStreamReader(new FileInputStream(args[1]), "utf-8"));

    PrintWriter pw = new PrintWriter(new OutputStreamWriter(System.out, "utf-8"));
    DocumentPreprocessor documentPreprocessor = new DocumentPreprocessor(r);
    documentPreprocessor.setTokenizerFactory(ptbTokenizerFactory);
    // Split the input into sentences, tag each one, and print it as tagged text.
    for (List<HasWord> sentence : documentPreprocessor) {
      List<TaggedWord> tSentence = tagger.tagSentence(sentence);
      pw.println(Sentence.listToString(tSentence, false));
    }

    // Print the adjectives in one more sentence. This shows how to get at words
    // and tags in a tagged sentence.
    List<HasWord> sent =
        Sentence.toWordList(
            "The", "slimy", "slug", "crawled", "over", "the", "long", ",", "green", "grass", ".");
    List<TaggedWord> taggedSent = tagger.tagSentence(sent);
    for (TaggedWord tw : taggedSent) {
      if (tw.tag().startsWith("JJ")) {
        pw.println(tw.word());
      }
    }

    pw.close();
  }
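
Beyond the DocumentPreprocessor loop above, MaxentTagger can tag a raw string in a single call with tagString, which is handy for quick checks. A minimal sketch; the class name and model path are placeholders:

import edu.stanford.nlp.tagger.maxent.MaxentTagger;

public class TagStringDemo {
  public static void main(String[] args) {
    // Placeholder path: point this at a trained tagger model.
    MaxentTagger tagger = new MaxentTagger("/path/to/english.tagger");
    // tagString tokenizes and tags the input, returning word_TAG text.
    System.out.println(tagger.tagString("The slimy slug crawled over the long, green grass."));
  }
}
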
Example #3

  public static void generate(String model, String fileToTag, String outfile) throws Exception {
    MaxentTagger tagger = new MaxentTagger(model);
    PrintWriter pw =
        new PrintWriter(new OutputStreamWriter(new FileOutputStream(outfile), "utf-8"));
    // Read the input with an explicit charset, matching the output stream.
    BufferedReader br =
        new BufferedReader(new InputStreamReader(new FileInputStream(fileToTag), "utf-8"));
    ArrayList<String> toks = new ArrayList<>();
    String line;
    while (true) {
      line = br.readLine();
      boolean endOfFile = (line == null);
      // A blank line (or the end of the file) marks the end of a sentence.
      if (endOfFile || line.isEmpty()) {
        if (!toks.isEmpty()) {
          List<HasWord> sent = Sentence.toWordList(toks.toArray(new String[0]));
          List<TaggedWord> taggedSent = tagger.tagSentence(sent);
          // Write one "word tag" pair per line, with a blank line between sentences.
          for (TaggedWord tw : taggedSent) {
            pw.println(tw.word() + " " + tw.tag());
          }
          pw.println();
          toks.clear();
        }
        if (endOfFile) {
          break;
        }
      } else {
        toks.add(line);
      }
    }
    br.close();
    pw.close();
  }
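
The generate method expects a vertical input format: one token per line, with a blank line ending each sentence. A usage sketch with placeholder file names:

// Given an input file tokens.txt such as:
//
//   The
//   dog
//   barked
//   .
//   (a blank line here ends the sentence)
//
// this call writes one "word tag" pair per line to tagged.txt, with a blank
// line between sentences. The model and file paths are placeholders.
generate("/path/to/english.tagger", "tokens.txt", "tagged.txt");
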
Example #4
  public ArrayList<String> getKeyWordsDependency(String sentence, String keyword) {
    // Note: loading the parser on every call is expensive; in practice it would
    // be loaded once and reused. The model path is hardcoded in this example.
    LexicalizedParser lp =
        LexicalizedParser.loadModel(
            "/home/mingrui/Desktop/englishPCFG.ser.gz",
            "-maxLength",
            "80",
            "-retainTmpSubcategories");
    TreebankLanguagePack tlp = new PennTreebankLanguagePack();
    // Uncomment the following line to obtain original Stanford Dependencies
    // tlp.setGenerateOriginalDependencies(true);
    GrammaticalStructureFactory gsf = tlp.grammaticalStructureFactory();
    String[] array = sentence.split("\\s+");
    Tree parse = lp.apply(Sentence.toWordList(array));
    GrammaticalStructure gs = gsf.newGrammaticalStructure(parse);
    Collection<TypedDependency> tdl = gs.typedDependenciesCCprocessed();
    ArrayList<String> keywordsDependency = new ArrayList<>();
    ArrayList<String> keywordsDependencyWithLemmatization = new ArrayList<>();
    for (TypedDependency t : tdl) {
      String d = t.toString();
      String dependencyType = d.substring(0, d.indexOf("("));
      // Extract the "governor, dependent" pair between the parentheses,
      // e.g. "crawled-4, slug-3" from "nsubj(crawled-4, slug-3)".
      String pair = d.substring(d.indexOf("(") + 1, d.lastIndexOf(")"));
      String[] terms = pair.split(",");
      String term1 = terms[0].trim();
      String term2 = terms[1].trim();
      // Dependency terms carry a word-index suffix ("slug-3"), so strip it
      // before comparing against the plain keyword tokens.
      String word1 = term1.contains("-") ? term1.substring(0, term1.lastIndexOf('-')) : term1;
      String word2 = term2.contains("-") ? term2.substring(0, term2.lastIndexOf('-')) : term2;

      // Match the keyword tokens against the terms in the tuple; on a match,
      // add the tuple to the result list.
      String[] wordsplitted = keyword.split(" ");
      for (String key : wordsplitted) {
        if (word1.equals(key) || word2.equals(key)) {
          keywordsDependency.add(t.toString());
        }
      }
    }

    // Fall back to lemmatized keywords when lemmatization changes the number
    // of tokens in the key phrase.
    String lemmatizedKeywords = lemmatize(keyword);
    int lbefore = keyword.split(" ").length;
    int lafter = lemmatizedKeywords.split(" ").length;
    if (lbefore == lafter) {
      return keywordsDependency;
    } else {
      String[] split = keyword.split(" ");
      for (String s : split) {
        // Compare the length of one keyword before and after lemmatization.
        String[] lemmas = lemmatize(s).split(" ");
        if (lemmas.length == s.split(" ").length) {
          continue;
        }
        for (String tuple : keywordsDependency) {
          // Find the tuples that contain the original keyword/key phrase.
          if (getTupleTerms(tuple)[0].equals(s)) {
            String dependent = getTupleTerms(tuple)[1];
            // `dependent` is extracted but never used: the lemmatized match is
            // left unfinished, so keywordsDependencyWithLemmatization stays empty.
          }
        }
      }
      return keywordsDependencyWithLemmatization;
    }
  }
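
getKeyWordsDependency calls two helpers, lemmatize and getTupleTerms, that this example does not show. The sketches below are hypothetical reconstructions inferred from the call sites; lemmatization uses the standard CoreNLP annotation pipeline:

import edu.stanford.nlp.ling.CoreAnnotations;
import edu.stanford.nlp.ling.CoreLabel;
import edu.stanford.nlp.pipeline.Annotation;
import edu.stanford.nlp.pipeline.StanfordCoreNLP;
import java.util.Properties;

// Hypothetical reconstruction: splits a dependency string such as
// "nsubj(crawled-4, slug-3)" into its governor and dependent terms.
private String[] getTupleTerms(String tuple) {
  String pair = tuple.substring(tuple.indexOf("(") + 1, tuple.lastIndexOf(")"));
  String[] terms = pair.split(",");
  return new String[] {terms[0].trim(), terms[1].trim()};
}

// Hypothetical reconstruction: runs a tokenize/ssplit/pos/lemma pipeline and
// joins the lemmas of all tokens with spaces.
private String lemmatize(String text) {
  Properties props = new Properties();
  props.setProperty("annotators", "tokenize,ssplit,pos,lemma");
  StanfordCoreNLP pipeline = new StanfordCoreNLP(props);
  Annotation document = new Annotation(text);
  pipeline.annotate(document);
  StringBuilder sb = new StringBuilder();
  for (CoreLabel token : document.get(CoreAnnotations.TokensAnnotation.class)) {
    if (sb.length() > 0) {
      sb.append(' ');
    }
    sb.append(token.get(CoreAnnotations.LemmaAnnotation.class));
  }
  return sb.toString();
}

In practice the StanfordCoreNLP pipeline would be constructed once and reused, since building it reloads its models on every call.
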