public static synchronized Dictionary initializeWordNet() { if (wordnet != null) return wordnet; try { String propsFileText = FileUtils.readFile(Utils.class.getResourceAsStream(propsFile)); Map<String, String> map = Maps.newTreeMap(); map.put("WordNet_dictionary_path", Utils.getConfig().getString("WordNet_dictionary_path")); propsFileText = StringUtil.macroReplace(propsFileText, map); JWNL.initialize(new StringInputStream(propsFileText)); // JWNL.initialize(new FileInputStream(propsFile)); wordnet = Dictionary.getInstance(); } catch (Exception ex) { throw new RuntimeException(ex); } SUPERTYPE_SYNSETS = new Synset[SUPERTYPES.length]; Synset[] classSynset; IndexWord iw; int count = 0; for (String type : SUPERTYPES) { try { iw = wordnet.getIndexWord(POS.NOUN, type); } catch (JWNLException e) { throw new RuntimeException(e); } if (iw == null) { System.err.println(type); continue; } try { classSynset = iw.getSenses(); } catch (JWNLException e) { throw new RuntimeException(e); } // System.err.println("**********************"); if (classSynset.length > 1) { // for(Synset cs:classSynset) // System.err.println(cs); if (type.equals("abstraction")) { SUPERTYPE_SYNSETS[count] = classSynset[5]; } else if (type.equals("measure")) { SUPERTYPE_SYNSETS[count] = classSynset[2]; } else if (type.equals("state")) { SUPERTYPE_SYNSETS[count] = classSynset[3]; } else if (type.equals("act")) { SUPERTYPE_SYNSETS[count] = classSynset[1]; } else { SUPERTYPE_SYNSETS[count] = classSynset[0]; } } count++; } if (wordnet == null) throw new RuntimeException("WordNet not intialized"); else { System.out.println("Wordnet initialized " + wordnet); } return wordnet; }
/**
 * Lazily loads and caches the set of male first names.
 *
 * <p>BUG FIX: the input stream is now closed via try-with-resources; the previous
 * version leaked it (assuming {@code Utils.readStringsSet} does not close it —
 * double-closing an InputStream is harmless either way).
 *
 * @return the cached, lazily-initialized set of male names
 * @throws RuntimeException wrapping any read failure
 */
private static Set<String> getMaleNames() {
  if (MALE_NAMES == null) {
    try (InputStream in = Utils.getMaleNames()) {
      MALE_NAMES = Utils.readStringsSet(in);
    } catch (Exception e) {
      throw new RuntimeException(e);
    }
  }
  return MALE_NAMES;
}
private static Set<String> getStopwords() { if (STOPWORDS == null) { InputStream in = Utils.getStopwords(); try { // System.out.println("Reading in stopwords"); STOPWORDS = Utils.readStringsSet(in); } catch (Exception e) { throw new RuntimeException(e); } } return STOPWORDS; }
public Classifier runLearner(Corpus trainCorpus, File workDir, String featSetName) throws IOException, FileNotFoundException { String modelName = cfg.getModelName(); String model = Utils.getWorkDirectory() + "/" + modelName; if (modelName == null) throw new RuntimeException("Model name needs to be specified (parameter MODEL_NAME)"); // SimpleDateFormat nameFormat = new SimpleDateFormat("yyyyMMdd"); // Date date = new Date(); // modelName = nameFormat.format(date) + "-" + modelName; Classifier classifier = Constructor.createClassifier(model); if (featSetName == null) throw new RuntimeException( "Feature set name needs to be specified (parameter FEAT_SET_NAME)"); if (modelName == null) throw new RuntimeException("Model name needs to be specified (parameter MODEL_NAME)"); // merge feature files together File mergedFeatureVector = File.createTempFile("mergedFeatureVector_", ".csv.gz", workDir); OutputStream trainFeatures = new FileOutputStream(mergedFeatureVector); FeatureMerger.combine(trainFeatures, trainCorpus); System.out.println("start training"); classifier.train(mergedFeatureVector, new File(workDir, classifier.getName() + ".model")); return classifier; }
/**
 * Compatibility feature for two proper-name NPs: COMPATIBLE when either linked
 * named entity's informative-word set is a subset of the other's.
 *
 * @return NA if either NP is not a proper name; INCOMPATIBLE if informative words
 *     are missing/empty or no subset relation holds; COMPATIBLE otherwise
 */
@Override
public String produceValue(
    Annotation np1, Annotation np2, Document doc, Map<Feature, String> featVector) {
  // Feature only applies to pairs of proper names.
  if (!(ProperName.getValue(np1, doc) && ProperName.getValue(np2, doc))) {
    return NA;
  }
  Annotation ne1 = (Annotation) np1.getProperty(Property.LINKED_PROPER_NAME);
  Annotation ne2 = (Annotation) np2.getProperty(Property.LINKED_PROPER_NAME);
  String[] infW1 = InfWords.getValue(ne1, doc);
  String[] infW2 = InfWords.getValue(ne2, doc);
  boolean haveWords =
      infW1 != null && infW2 != null && infW1.length >= 1 && infW2.length >= 1;
  if (!haveWords) {
    return INCOMPATIBLE;
  }
  return Utils.isAnySubset(infW1, infW2) ? COMPATIBLE : INCOMPATIBLE;
}
/**
 * Runs the external TagChunk named-entity tagger over the document's sentences and
 * adds the recognized entities as a new annotation set.
 *
 * <p>Pipeline: (1) collect each non-punctuation sentence as one space-joined line of
 * cleaned token text, remembering the tokens per line; (2) write the lines to a temp
 * file; (3) shell out to the tagger and capture its output; (4) parse the per-token
 * {@code word_TAG} output back into Annotations, merging B-/I- spans.
 */
@Override
public void run(Document doc, String annSetNames[]) {
  String tagChunk = currentConfig.getTagChunk();
  String listDir = currentConfig.getTagChunkLists();
  AnnotationSet namedEntities = new AnnotationSet(annSetNames[0]);
  // get the sentences from the input
  AnnotationSet sentSet = doc.getAnnotationSet(Constants.SENT);
  // get the tokens from each sentence
  AnnotationSet tokenSet = doc.getAnnotationSet(Constants.TOKEN);
  // Read in the text from the raw file
  String text = doc.getText();
  Iterator<Annotation> sents = sentSet.iterator();
  ArrayList<String> lines = new ArrayList<String>();
  // tokenList.get(i) holds the token annotations for lines.get(i), in order, so the
  // tagger's j-th output token on line i can be mapped back to its annotation.
  ArrayList<Vector<Annotation>> tokenList = new ArrayList<Vector<Annotation>>();
  while (sents.hasNext()) {
    Vector<Annotation> annVector = new Vector<Annotation>();
    Annotation sent = sents.next();
    int sentStart = sent.getStartOffset();
    int sentEnd = sent.getEndOffset();
    String sentText = Annotation.getAnnotText(sent, text);
    AnnotationSet sentTokens = tokenSet.getContained(sentStart, sentEnd);
    // gather all sentences to tag; skip sentences that are all non-word characters
    if (!sentText.matches("\\W+")) {
      StringBuilder tmp = new StringBuilder();
      for (Annotation a : sentTokens) {
        tmp.append(Annotation.getAnnotTextClean(a, text)).append(" ");
        annVector.add(a);
      }
      lines.add(tmp.toString());
      tokenList.add(annVector);
    }
  }
  // write out a tmp file that contains the words to be tagged
  File tmpFile = new File(doc.getRootDir(), "tmp.ner");
  try {
    tmpFile.deleteOnExit();
    FileWriter fw = new FileWriter(tmpFile);
    BufferedWriter bw = new BufferedWriter(fw);
    for (String l : lines) {
      // System.out.println(l);
      bw.write(l + "\n");
    }
    bw.close();
    fw.close();
  } catch (IOException ioe) {
    ioe.printStackTrace();
  }
  // run the tagger
  String command = tagChunk + " -predict . " + modelDir + Utils.SEPARATOR + models[0] + " "
      + tmpFile.getAbsolutePath() + " " + listDir;
  // collect the results
  ArrayList<String> results;
  int i = 0; // index of the tagged line / sentence being processed
  try {
    results = Utils.runExternalCaptureOutput(command);
    Annotation current = null; // entity span currently being extended, if any
    for (String l : results) {
      Vector<Annotation> annVector = tokenList.get(i);
      // get rid of these extraneous tags
      l = l.replace("_O-O", "");
      String[] tokens = l.split(" ");
      // System.out.println(l);
      int j = 0; // index into this line's token annotations
      int underscore;
      int nes = 1; // id for the next entity (NOTE(review): resets per line)
      String tag;
      for (String t : tokens) {
        // Each output token is "word_TAG"; take the suffix after the last underscore.
        underscore = t.lastIndexOf('_');
        tag = t.substring(underscore + 1, t.length());
        Annotation a = annVector.get(j);
        // System.out.print(Utils.getAnnotTextClean(a, text) + "_" + tag + " ");
        if (tag.equals("B-O")) {
          // "Outside" tag: close any open entity and move on.
          j++;
          if (current != null) {
            namedEntities.add(current);
            // System.out.println("NE Found: " + Utils.getAnnotTextClean(current, text) + ":" +
            // current.getType());
            nes++;
            current = null;
          }
          continue;
        }
        // Map the tagger's entity codes onto this pipeline's type names.
        String entityType = tag.substring(tag.indexOf("-") + 1, tag.length());
        if (entityType.equals("ORG")) {
          entityType = "ORGANIZATION";
        } else if (entityType.equals("LOC")) {
          entityType = "LOCATION";
        } else if (entityType.equals("PER")) {
          entityType = "PERSON";
        } else if (entityType.equals("VEH")) {
          entityType = "VEHICLE";
        }
        if (tag.startsWith("B-")) {
          // Beginning of an entity: flush the previous one, then open a new span.
          if (current != null) {
            namedEntities.add(current);
            nes++;
            current = null;
            // System.out.println("NE Found: " + Utils.getAnnotTextClean(current, text));
          }
          current = new Annotation(nes, a.getStartOffset(), a.getEndOffset(), entityType);
        } else if (tag.startsWith("I-")) {
          // Inside tag: extend the open span, or (tolerating a stray I- without a
          // preceding B-) start a new one.
          if (current != null) {
            current.setEndOffset(a.getEndOffset());
          } else {
            current = new Annotation(nes, a.getStartOffset(), a.getEndOffset(), entityType);
          }
        }
        j++;
      }
      // System.out.println();
      i++;
    }
    FileUtils.delete(tmpFile);
  } catch (IOException e) {
    throw new RuntimeException(e);
  } catch (InterruptedException e) {
    throw new RuntimeException(e);
  }
  addResultSet(doc, namedEntities);
}
/**
 * Builds a TagChunk NER wrapper with its defaults taken from the global
 * configuration: the shared Config object, the data directory holding the models,
 * and the configured "tagchunkmodels" model list.
 */
public NamedEntityTagChunk() {
  modelDir = Utils.getDataDirectory();
  currentConfig = Utils.getConfig();
  models = currentConfig.getNERModels("tagchunkmodels");
}
/**
 * Trains this classifier on the given feature file, lazily loading the
 * classifier-specific options ({@code ClOptions.<name>}) from the Config on first
 * use and delegating to the three-argument overload.
 *
 * @param trainFile feature file to train on
 * @param outputModelFile where the trained model is written
 */
public void train(File trainFile, File outputModelFile) {
  String[] options = mOptions;
  if (options == null) {
    // First call: fetch and cache the options for this classifier.
    options = Utils.getConfig().getStringArray("ClOptions." + getName());
    mOptions = options;
  }
  train(trainFile, outputModelFile, options);
}
/**
 * Classifies the instances read from {@code testFile}, lazily loading the
 * classifier-specific options ({@code ClOptions.<name>}) from the Config on first
 * use and delegating to the three-argument overload.
 *
 * @param testFile source of instances to classify
 * @param outputFile sink for the classification output
 * @return the minimum and maximum numerical values of the classified instances
 */
public double[] test(Reader testFile, Writer outputFile) {
  String[] options = mOptions;
  if (options == null) {
    // First call: fetch and cache the options for this classifier.
    options = Utils.getConfig().getStringArray("ClOptions." + getName());
    mOptions = options;
  }
  return test(testFile, outputFile, options);
}