Example #1
1
  public static synchronized Dictionary initializeWordNet() {

    if (wordnet != null) return wordnet;

    try {
      String propsFileText = FileUtils.readFile(Utils.class.getResourceAsStream(propsFile));
      Map<String, String> map = Maps.newTreeMap();
      map.put("WordNet_dictionary_path", Utils.getConfig().getString("WordNet_dictionary_path"));
      propsFileText = StringUtil.macroReplace(propsFileText, map);
      JWNL.initialize(new StringInputStream(propsFileText));
      // JWNL.initialize(new FileInputStream(propsFile));
      wordnet = Dictionary.getInstance();
    } catch (Exception ex) {
      throw new RuntimeException(ex);
    }

    SUPERTYPE_SYNSETS = new Synset[SUPERTYPES.length];
    Synset[] classSynset;
    IndexWord iw;
    int count = 0;
    for (String type : SUPERTYPES) {
      try {
        iw = wordnet.getIndexWord(POS.NOUN, type);
      } catch (JWNLException e) {
        throw new RuntimeException(e);
      }
      if (iw == null) {
        System.err.println("No WordNet entry found for supertype: " + type);
        count++; // keep SUPERTYPE_SYNSETS aligned with SUPERTYPES
        continue;
      }

      try {
        classSynset = iw.getSenses();
      } catch (JWNLException e) {
        throw new RuntimeException(e);
      }
      // System.err.println("**********************");
      // For polysemous supertypes, pick a hand-chosen WordNet sense index; all other types default to the first sense.
      if (classSynset.length > 1) {
        // for(Synset cs:classSynset)
        // System.err.println(cs);
        if (type.equals("abstraction")) {
          SUPERTYPE_SYNSETS[count] = classSynset[5];
        } else if (type.equals("measure")) {
          SUPERTYPE_SYNSETS[count] = classSynset[2];
        } else if (type.equals("state")) {
          SUPERTYPE_SYNSETS[count] = classSynset[3];
        } else if (type.equals("act")) {
          SUPERTYPE_SYNSETS[count] = classSynset[1];
        } else {
          SUPERTYPE_SYNSETS[count] = classSynset[0];
        }
      }
      count++;
    }
    if (wordnet == null) throw new RuntimeException("WordNet not intialized");
    else {
      System.out.println("Wordnet initialized " + wordnet);
    }
    return wordnet;
  }
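For reference, a minimal usage sketch of the initializer above. The helper name printNounSenses and the example noun are illustrative assumptions, not part of the original class; the JWNL calls simply mirror the ones already used in initializeWordNet, and the helper is assumed to live in the same class.

  // Hypothetical helper, assumed to sit next to initializeWordNet() in the same class.
  public static void printNounSenses(String noun) throws JWNLException {
    Dictionary dict = initializeWordNet();            // idempotent: returns the cached Dictionary after the first call
    IndexWord iw = dict.getIndexWord(POS.NOUN, noun); // null if WordNet has no entry for this noun
    if (iw == null) return;
    for (Synset sense : iw.getSenses()) {
      System.out.println(sense);                      // one line per WordNet sense
    }
  }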
Example #2
0
  private static Set<String> getMaleNames() {
    if (MALE_NAMES == null) {
      // read the name list once and cache it, closing the stream when done
      try (InputStream in = Utils.getMaleNames()) {
        MALE_NAMES = Utils.readStringsSet(in);
      } catch (Exception e) {
        throw new RuntimeException(e);
      }
    }

    return MALE_NAMES;
  }
Example #3
0
  private static Set<String> getStopwords() {
    if (STOPWORDS == null) {
      // read the stopword list once and cache it, closing the stream when done
      try (InputStream in = Utils.getStopwords()) {
        STOPWORDS = Utils.readStringsSet(in);
      } catch (Exception e) {
        throw new RuntimeException(e);
      }
    }

    return STOPWORDS;
  }
  public Classifier runLearner(Corpus trainCorpus, File workDir, String featSetName)
      throws IOException, FileNotFoundException {
    // validate the configuration before building any paths
    String modelName = cfg.getModelName();
    if (modelName == null)
      throw new RuntimeException("Model name needs to be specified (parameter MODEL_NAME)");
    if (featSetName == null)
      throw new RuntimeException(
          "Feature set name needs to be specified (parameter FEAT_SET_NAME)");
    // SimpleDateFormat nameFormat = new SimpleDateFormat("yyyyMMdd");
    // Date date = new Date();
    // modelName = nameFormat.format(date) + "-" + modelName;
    String model = Utils.getWorkDirectory() + "/" + modelName;
    Classifier classifier = Constructor.createClassifier(model);

    // merge feature files together
    File mergedFeatureVector = File.createTempFile("mergedFeatureVector_", ".csv.gz", workDir);

    try (OutputStream trainFeatures = new FileOutputStream(mergedFeatureVector)) {
      FeatureMerger.combine(trainFeatures, trainCorpus);
    }
    System.out.println("start training");
    classifier.train(mergedFeatureVector, new File(workDir, classifier.getName() + ".model"));
    return classifier;
  }
Example #5
0
  @Override
  public String produceValue(
      Annotation np1, Annotation np2, Document doc, Map<Feature, String> featVector) {

    if (!ProperName.getValue(np1, doc) || !ProperName.getValue(np2, doc)) return NA;
    Annotation ne1 = (Annotation) np1.getProperty(Property.LINKED_PROPER_NAME);
    Annotation ne2 = (Annotation) np2.getProperty(Property.LINKED_PROPER_NAME);
    String[] infW1 = InfWords.getValue(ne1, doc);
    String[] infW2 = InfWords.getValue(ne2, doc);
    if (infW1 == null || infW2 == null || infW1.length < 1 || infW2.length < 1) return INCOMPATIBLE;
    if (Utils.isAnySubset(infW1, infW2)) return COMPATIBLE;
    else return INCOMPATIBLE;
  }
  @Override
  public void run(Document doc, String[] annSetNames) {

    String tagChunk = currentConfig.getTagChunk();
    String listDir = currentConfig.getTagChunkLists();

    AnnotationSet namedEntities = new AnnotationSet(annSetNames[0]);

    // get the sentences from the input
    AnnotationSet sentSet = doc.getAnnotationSet(Constants.SENT);

    // get the tokens from each sentence
    AnnotationSet tokenSet = doc.getAnnotationSet(Constants.TOKEN);

    // Read in the text from the raw file
    String text = doc.getText();

    Iterator<Annotation> sents = sentSet.iterator();
    ArrayList<String> lines = new ArrayList<String>();
    ArrayList<Vector<Annotation>> tokenList = new ArrayList<Vector<Annotation>>();

    while (sents.hasNext()) {
      Vector<Annotation> annVector = new Vector<Annotation>();
      Annotation sent = sents.next();
      int sentStart = sent.getStartOffset();
      int sentEnd = sent.getEndOffset();
      String sentText = Annotation.getAnnotText(sent, text);
      AnnotationSet sentTokens = tokenSet.getContained(sentStart, sentEnd);

      // gather all sentences to tag
      if (!sentText.matches("\\W+")) {
        StringBuilder tmp = new StringBuilder();
        for (Annotation a : sentTokens) {
          tmp.append(Annotation.getAnnotTextClean(a, text)).append(" ");
          annVector.add(a);
        }

        lines.add(tmp.toString());
        tokenList.add(annVector);
      }
    }

    // write out a tmp file that contains the words to be tagged
    File tmpFile = new File(doc.getRootDir(), "tmp.ner");
    tmpFile.deleteOnExit();
    try (BufferedWriter bw = new BufferedWriter(new FileWriter(tmpFile))) {
      for (String l : lines) {
        bw.write(l + "\n");
      }
    } catch (IOException ioe) {
      ioe.printStackTrace();
    }

    // run the tagger
    String command =
        tagChunk
            + " -predict . "
            + modelDir
            + Utils.SEPARATOR
            + models[0]
            + " "
            + tmpFile.getAbsolutePath()
            + " "
            + listDir;

    // collect the results
    ArrayList<String> results;
    int i = 0;
    try {
      results = Utils.runExternalCaptureOutput(command);
      Annotation current = null;
      for (String l : results) {
        Vector<Annotation> annVector = tokenList.get(i);

        // get rid of these extraneous tags
        l = l.replace("_O-O", "");
        String[] tokens = l.split(" ");
        // System.out.println(l);

        int j = 0;
        int nes = 1;
        for (String t : tokens) {
          // tagged tokens look like "word_B-PER"; tokens whose "_O-O" suffix was stripped above carry no tag
          int underscore = t.lastIndexOf('_');
          String tag = t.substring(underscore + 1);
          Annotation a = annVector.get(j);
          // System.out.print(Utils.getAnnotTextClean(a, text) + "_" + tag + " ");

          if (tag.equals("B-O")) {
            j++;
            if (current != null) {
              namedEntities.add(current);
              // System.out.println("NE Found: " + Utils.getAnnotTextClean(current, text) + ":" +
              // current.getType());
              nes++;
              current = null;
            }

            continue;
          }

          String entityType = tag.substring(tag.indexOf('-') + 1);

          if (entityType.equals("ORG")) {
            entityType = "ORGANIZATION";
          } else if (entityType.equals("LOC")) {
            entityType = "LOCATION";
          } else if (entityType.equals("PER")) {
            entityType = "PERSON";
          } else if (entityType.equals("VEH")) {
            entityType = "VEHICLE";
          }

          if (tag.startsWith("B-")) {
            if (current != null) {
              namedEntities.add(current);
              nes++;
              current = null;
              // System.out.println("NE Found: " + Utils.getAnnotTextClean(current, text));
            }

            current = new Annotation(nes, a.getStartOffset(), a.getEndOffset(), entityType);
          } else if (tag.startsWith("I-")) {
            if (current != null) {
              current.setEndOffset(a.getEndOffset());
            } else {
              current = new Annotation(nes, a.getStartOffset(), a.getEndOffset(), entityType);
            }
          }

          j++;
        }

        // System.out.println();
        i++;
      }
      // flush an entity that is still open when the tagger output ends
      if (current != null) {
        namedEntities.add(current);
      }
      FileUtils.delete(tmpFile);
    } catch (IOException | InterruptedException e) {
      throw new RuntimeException(e);
    }

    addResultSet(doc, namedEntities);
  }
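As an illustration of the tag parsing inside run() above, the standalone snippet below splits one hypothetical line of tagger output into word/tag pairs. The sample line and its tags are assumptions inferred from the replacements and prefix checks in the code, not captured tagger output.

    // Illustration only: a hypothetical tagger output line, parsed the same way as in run().
    String line = "John_B-PER Smith_I-PER visited_O-O Boston_B-LOC".replace("_O-O", "");
    for (String t : line.split(" ")) {
      int underscore = t.lastIndexOf('_');
      String tag = t.substring(underscore + 1); // "B-PER", "I-PER", "B-LOC", or the bare word for untagged tokens
      System.out.println(t + " -> " + tag);
    }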
  public NamedEntityTagChunk() {
    /* defaults */
    currentConfig = Utils.getConfig();
    modelDir = Utils.getDataDirectory();
    models = currentConfig.getNERModels("tagchunkmodels");
  }
  /**
   * Trains the classifier on the given feature file. Classifier options are read from the Config
   * (key {@code "ClOptions." + getName()}) the first time they are needed.
   *
   * @param trainFile the feature file to train on
   * @param outputModelFile the file the trained model is written to
   */
  public void train(File trainFile, File outputModelFile) {
    if (mOptions == null) {
      mOptions = Utils.getConfig().getStringArray("ClOptions." + getName());
    }
    train(trainFile, outputModelFile, mOptions);
  }
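A short usage sketch for train(), following the pattern runLearner uses earlier in this section; workDir, modelName, trainCorpus and the temp-file handling are assumptions carried over from that method rather than part of this class.

    // Hypothetical caller; mirrors the runLearner pattern shown above.
    Classifier classifier = Constructor.createClassifier(Utils.getWorkDirectory() + "/" + modelName);
    File features = File.createTempFile("mergedFeatureVector_", ".csv.gz", workDir);
    try (OutputStream trainFeatures = new FileOutputStream(features)) {
      FeatureMerger.combine(trainFeatures, trainCorpus); // fill the merged feature file
    }
    classifier.train(features, new File(workDir, classifier.getName() + ".model"));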
  /**
   * Classifies the instances read from the given test file. Classifier options are read from the
   * Config (key {@code "ClOptions." + getName()}) the first time they are needed.
   *
   * @param testFile a reader over the feature vectors to classify
   * @param outputFile a writer that receives the classification output
   * @return the minimum and maximum numerical values of the classified instances
   */
  public double[] test(Reader testFile, Writer outputFile) {
    if (mOptions == null) {
      mOptions = Utils.getConfig().getStringArray("ClOptions." + getName());
    }
    return test(testFile, outputFile, mOptions);
  }
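A matching sketch for test(), continuing with the classifier from the train sketch above; the file names and the use of the returned array as a score range are illustrative assumptions.

    // Hypothetical caller; "test.csv" and "predictions.txt" are placeholder file names.
    try (Reader testFeatures = new FileReader(new File(workDir, "test.csv"));
        Writer predictions = new FileWriter(new File(workDir, "predictions.txt"))) {
      double[] minMax = classifier.test(testFeatures, predictions);
      System.out.println("classifier scores range from " + minMax[0] + " to " + minMax[1]);
    }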