// get signs with additional args for a known special token const, target pred and target rel private SignHash getSignsFromWord( Word w, String specialTokenConst, String targetPred, String targetRel) throws LexException { Collection<MorphItem> morphItems = (specialTokenConst == null) ? (Collection<MorphItem>) _words.get(w) : null; if (morphItems == null) { // check for special tokens if (specialTokenConst == null) { specialTokenConst = tokenizer.getSpecialTokenConstant(tokenizer.isSpecialToken(w.getForm())); targetPred = w.getForm(); } if (specialTokenConst != null) { Word key = Word.createSurfaceWord(w, specialTokenConst); morphItems = (Collection<MorphItem>) _words.get(key); } // otherwise throw lex exception if (morphItems == null) throw new LexException(w + " not in lexicon"); } SignHash result = new SignHash(); for (Iterator<MorphItem> MI = morphItems.iterator(); MI.hasNext(); ) { getWithMorphItem(w, MI.next(), targetPred, targetRel, result); } return result; }
// returns a macro adder for the given morph item private MacroAdder getMacAdder(MorphItem mi) { // check map MacroAdder retval = macAdderMap.get(mi); if (retval != null) return retval; // set up macro adder IntHashSetMap macrosFromLex = new IntHashSetMap(); String[] newMacroNames = mi.getMacros(); List<MacroItem> macroItems = new ArrayList<MacroItem>(); for (int i = 0; i < newMacroNames.length; i++) { Set<FeatureStructure> featStrucs = (Set<FeatureStructure>) _macros.get(newMacroNames[i]); if (featStrucs != null) { for (Iterator<FeatureStructure> fsIt = featStrucs.iterator(); fsIt.hasNext(); ) { FeatureStructure fs = fsIt.next(); macrosFromLex.put(fs.getIndex(), fs); } } MacroItem macroItem = _macroItems.get(newMacroNames[i]); if (macroItem != null) { macroItems.add(macroItem); } else { // should be checked earlier too System.err.println( "Warning: macro " + newMacroNames[i] + " not found for word '" + mi.getWord() + "'"); } } retval = new MacroAdder(macrosFromLex, macroItems); // update map and return macAdderMap.put(mi, retval); return retval; }
// look up and apply coarts for w to each sign in result @SuppressWarnings("unchecked") private void applyCoarts(Word w, SignHash result) throws LexException { List<Sign> inputSigns = new ArrayList<Sign>(result.asSignSet()); result.clear(); List<Sign> outputSigns = new ArrayList<Sign>(inputSigns.size()); // for each surface attr, lookup coarts and apply to input signs, storing results in output // signs for (Iterator<Pair<String, String>> it = w.getSurfaceAttrValPairs(); it.hasNext(); ) { Pair<String, String> p = it.next(); String attr = (String) p.a; if (!_indexedCoartAttrs.contains(attr)) continue; String val = (String) p.b; Word coartWord = Word.createWord(attr, val); SignHash coartResult = getSignsFromWord(coartWord, null, null, null); for (Iterator<Sign> it2 = coartResult.iterator(); it2.hasNext(); ) { Sign coartSign = it2.next(); // apply to each input for (int j = 0; j < inputSigns.size(); j++) { Sign sign = inputSigns.get(j); grammar.rules.applyCoart(sign, coartSign, outputSigns); } } // switch output to input for next iteration inputSigns.clear(); inputSigns.addAll(outputSigns); outputSigns.clear(); } // add results back result.addAll(inputSigns); }
// get signs for rel via preds, or null if none private Collection<Sign> getSignsFromRelAndPreds(String rel, Collection<String> preds) { List<Sign> retval = new ArrayList<Sign>(); for (Iterator<String> it = preds.iterator(); it.hasNext(); ) { String pred = it.next(); Collection<Sign> signs = getSignsFromPredAndTargetRel(pred, rel); if (signs != null) retval.addAll(signs); } // return null if none survive filter if (retval.size() > 0) return retval; else return null; }
// get licensing features, with appropriate defaults @SuppressWarnings("unchecked") private void loadLicensingFeatures(Element licensingElt) { List<LicensingFeature> licensingFeats = new ArrayList<LicensingFeature>(); boolean containsLexFeat = false; if (licensingElt != null) { for (Iterator<Element> it = licensingElt.getChildren("feat").iterator(); it.hasNext(); ) { Element featElt = it.next(); String attr = featElt.getAttributeValue("attr"); if (attr.equals("lex")) containsLexFeat = true; String val = featElt.getAttributeValue("val"); List<String> alsoLicensedBy = null; String alsoVals = featElt.getAttributeValue("also-licensed-by"); if (alsoVals != null) { alsoLicensedBy = Arrays.asList(alsoVals.split("\\s+")); } boolean licenseEmptyCats = true; boolean licenseMarkedCats = false; boolean instantiate = true; byte loc = LicensingFeature.BOTH; String lmc = featElt.getAttributeValue("license-marked-cats"); if (lmc != null) { licenseMarkedCats = Boolean.valueOf(lmc).booleanValue(); // change defaults licenseEmptyCats = false; loc = LicensingFeature.TARGET_ONLY; instantiate = false; } String lec = featElt.getAttributeValue("license-empty-cats"); if (lec != null) { licenseEmptyCats = Boolean.valueOf(lec).booleanValue(); } String inst = featElt.getAttributeValue("instantiate"); if (inst != null) { instantiate = Boolean.valueOf(inst).booleanValue(); } String locStr = featElt.getAttributeValue("location"); if (locStr != null) { if (locStr.equals("target-only")) loc = LicensingFeature.TARGET_ONLY; if (locStr.equals("args-only")) loc = LicensingFeature.ARGS_ONLY; if (locStr.equals("both")) loc = LicensingFeature.BOTH; } licensingFeats.add( new LicensingFeature( attr, val, alsoLicensedBy, licenseEmptyCats, licenseMarkedCats, instantiate, loc)); } } if (!containsLexFeat) { licensingFeats.add(LicensingFeature.defaultLexFeature); } _licensingFeatures = new LicensingFeature[licensingFeats.size()]; licensingFeats.toArray(_licensingFeatures); }
/** * For a string of 1 or more surface words, return all of the lexical entries for each word as a * list of sign hashes. Tokenization is performed using the configured tokenizer. * * @param w the words in string format * @return a list of sign hashes * @exception LexException thrown if word not found */ public List<SignHash> getEntriesFromWords(String s) throws LexException { List<SignHash> entries = new ArrayList<SignHash>(); List<Word> words = tokenizer.tokenize(s); for (Iterator<Word> it = words.iterator(); it.hasNext(); ) { Word w = it.next(); SignHash signs = getSignsFromWord(w); if (signs.size() == 0) { throw new LexException("Word not in lexicon: \"" + w + "\""); } entries.add(signs); } return entries; }
public void forall(Category c) { if (!(c instanceof AtomCat)) return; String type = ((AtomCat) c).getType(); FeatureStructure fs = c.getFeatureStructure(); if (fs == null) return; for (Iterator<String> it = fs.getAttributes().iterator(); it.hasNext(); ) { String att = it.next(); _catsToAttrs.put(type, att); if (fs.getValue(att) instanceof LF) { _lfAttrs.add(att); } } }
public void modify(Mutable m) { if (m instanceof Proposition) { Proposition prop = (Proposition) m; if (prop.getName().equals(DEFAULT_VAL)) prop.setAtomName(REPLACEMENT); } else if (m instanceof FeatureStructure) { FeatureStructure fs = (FeatureStructure) m; for (Iterator<String> it = fs.getAttributes().iterator(); it.hasNext(); ) { String attr = it.next(); Object val = fs.getValue(attr); if (val instanceof SimpleType && ((SimpleType) val).getName().equals(DEFAULT_VAL)) { fs.setFeature(attr, grammar.types.getSimpleType(REPLACEMENT)); } } } }
// look up and apply coarts for given rels to each sign in result private void applyCoarts(List<String> coartRels, Collection<Sign> result) { List<Sign> inputSigns = new ArrayList<Sign>(result); result.clear(); List<Sign> outputSigns = new ArrayList<Sign>(inputSigns.size()); // for each rel, lookup coarts and apply to input signs, storing results in output signs for (Iterator<String> it = coartRels.iterator(); it.hasNext(); ) { String rel = it.next(); Collection<String> preds = (Collection<String>) _coartRelsToPreds.get(rel); if (preds == null) continue; // not expected Collection<Sign> coartResult = getSignsFromRelAndPreds(rel, preds); if (coartResult == null) continue; for (Iterator<Sign> it2 = coartResult.iterator(); it2.hasNext(); ) { Sign coartSign = it2.next(); // apply to each input for (int j = 0; j < inputSigns.size(); j++) { Sign sign = inputSigns.get(j); grammar.rules.applyCoart(sign, coartSign, outputSigns); } } // switch output to input for next iteration inputSigns.clear(); inputSigns.addAll(outputSigns); outputSigns.clear(); } // add results back result.addAll(inputSigns); }
// get signs using an additional arg for a target rel private Collection<Sign> getSignsFromPredAndTargetRel(String pred, String targetRel) { Collection<Word> words = (Collection<Word>) _predToWords.get(pred); String specialTokenConst = null; // for robustness, when using supertagger, add words for pred sans sense index int dotIndex = -1; if (_supertagger != null && !Character.isDigit(pred.charAt(0)) && // skip numbers (dotIndex = pred.lastIndexOf('.')) > 0 && pred.length() > dotIndex + 1 && pred.charAt(dotIndex + 1) != '_') // skip titles, eg Mr._Smith { String barePred = pred.substring(0, dotIndex); Collection<Word> barePredWords = (Collection<Word>) _predToWords.get(barePred); if (words == null) words = barePredWords; else if (barePredWords != null) { Set<Word> unionWords = new HashSet<Word>(words); unionWords.addAll(barePredWords); words = unionWords; } } if (words == null) { specialTokenConst = tokenizer.getSpecialTokenConstant(tokenizer.isSpecialToken(pred)); if (specialTokenConst == null) return null; // lookup words with pred = special token const Collection<Word> specialTokenWords = (Collection<Word>) _predToWords.get(specialTokenConst); // replace special token const with pred if (specialTokenWords == null) return null; words = new ArrayList<Word>(specialTokenWords.size()); for (Iterator<Word> it = specialTokenWords.iterator(); it.hasNext(); ) { Word stw = it.next(); Word w = Word.createSurfaceWord(stw, pred); words.add(w); } } List<Sign> retval = new ArrayList<Sign>(); for (Iterator<Word> it = words.iterator(); it.hasNext(); ) { Word w = it.next(); try { SignHash signs = getSignsFromWord(w, specialTokenConst, pred, targetRel); retval.addAll(signs.asSignSet()); } // shouldn't happen catch (LexException exc) { System.err.println("Unexpected lex exception for word " + w + ": " + exc); } } return retval; }
public void forall(Category c) { // get feature structures if (!(c instanceof AtomCat)) return; String type = ((AtomCat) c).getType(); FeatureStructure fs = c.getFeatureStructure(); GFeatStruc gfs = (GFeatStruc) fs; if (gfs == null || gfs.getInheritsFrom() == 0) return; int inhf = gfs.getInheritsFrom(); FeatureStructure inhfFS = (FeatureStructure) featStrucMap.get(inhf); if (inhfFS != null) { // copy values of features from inhfFS not already present for (Iterator<String> it = inhfFS.getAttributes().iterator(); it.hasNext(); ) { String att = it.next(); if (gfs.hasAttribute(att)) continue; gfs.setFeature(att, UnifyControl.copy(inhfFS.getValue(att))); } // for each possible attr used with this type and not already present, // add feature equation Collection<String> attrs = (Collection<String>) _catsToAttrs.get(type); if (attrs == null) return; for (Iterator<String> it = attrs.iterator(); it.hasNext(); ) { String att = it.next(); if (gfs.hasAttribute(att)) continue; String varName = att.toUpperCase() + inhf; if (_lfAttrs.contains(att)) { gfs.setFeature(att, new HyloVar(varName)); inhfFS.setFeature(att, new HyloVar(varName)); } else { gfs.setFeature(att, new GFeatVar(varName)); inhfFS.setFeature(att, new GFeatVar(varName)); } } } else { System.err.println( "Warning: no feature structure with inheritsFrom index of " + inhf + " found in category " + c); } }
// given MorphItem private void getWithMorphItem( Word w, MorphItem mi, String targetPred, String targetRel, SignHash result) throws LexException { // get supertags for filtering, if a supertagger is installed Map<String, Double> supertags = null; Set<String> supertagsFound = null; if (_supertagger != null) { supertags = _supertagger.getSupertags(); if (supertags != null) supertagsFound = new HashSet<String>(supertags.size()); } // get macro adder MacroAdder macAdder = getMacAdder(mi); // if we have this stem in our lexicon String stem = mi.getWord().getStem(); String pos = mi.getWord().getPOS(); Set<EntriesItem[]> explicitEntries = null; // for storing entries from explicitly listed family members if (_stems.containsKey(stem + pos)) { explicitEntries = new HashSet<EntriesItem[]>(); Collection<Object> stemItems = (Collection<Object>) _stems.get(stem + pos); for (Iterator<Object> I = stemItems.iterator(); I.hasNext(); ) { Object item = I.next(); // see if it's an EntriesItem if (item instanceof EntriesItem) { EntriesItem entry = (EntriesItem) item; // do lookup getWithEntriesItem( w, mi, stem, stem, targetPred, targetRel, entry, macAdder, supertags, supertagsFound, result); } // otherwise it has to be a Pair containing a DataItem and // an EntriesItem[] else { @SuppressWarnings("rawtypes") DataItem dItem = (DataItem) ((Pair) item).a; @SuppressWarnings("rawtypes") EntriesItem[] entries = (EntriesItem[]) ((Pair) item).b; // store entries explicitEntries.add(entries); // do lookup getWithDataItem( w, mi, dItem, entries, targetPred, targetRel, macAdder, supertags, supertagsFound, result); } } } // for entries that are not explicitly in the lexicon file, we have to create // Signs from the open class entries with the appropriate part-of-speech Collection<EntriesItem[]> entrySets = (Collection<EntriesItem[]>) _posToEntries.get(pos); if (entrySets != null) { for (Iterator<EntriesItem[]> E = entrySets.iterator(); E.hasNext(); ) { EntriesItem[] entries = E.next(); // skip if entries explicitly listed if (explicitEntries != null && explicitEntries.contains(entries)) continue; // otherwise get entries with pred = targetPred, or stem if null String pred = (targetPred != null) ? targetPred : stem; getWithDataItem( w, mi, new DataItem(stem, pred), entries, targetPred, targetRel, macAdder, supertags, supertagsFound, result); } } // finally do entries for any remaining supertags if (supertags != null) { for (String supertag : supertags.keySet()) { if (supertagsFound.contains(supertag)) continue; Set<EntriesItem> entries = _stagToEntries.get(supertag + pos); if (entries == null) continue; // nb: could be a POS mismatch // get entries with pred = targetPred, or stem if null String pred = (targetPred != null) ? targetPred : stem; for (EntriesItem entry : entries) { if (!entry.getStem().equals(DEFAULT_VAL)) continue; getWithEntriesItem( w, mi, stem, pred, targetPred, targetRel, entry, macAdder, supertags, supertagsFound, result); } } } }
/** Loads the lexicon and morph files. */ public void init(URL lexiconUrl, URL morphUrl) throws IOException { List<Family> lexicon = null; List<MorphItem> morph = null; List<MacroItem> macroModel = null; // load category families (lexicon), morph forms and macros lexicon = getLexicon(lexiconUrl); Pair<List<MorphItem>, List<MacroItem>> morphInfo = getMorph(morphUrl); morph = morphInfo.a; macroModel = morphInfo.b; // index words; also index stems to words, as default preds // store indexed coarticulation attrs too _words = new GroupMap<Word, MorphItem>(); _predToWords = new GroupMap<String, Word>(); _coartAttrs = new HashSet<String>(); _indexedCoartAttrs = new HashSet<String>(); for (MorphItem morphItem : morph) { Word surfaceWord = morphItem.getSurfaceWord(); _words.put(surfaceWord, morphItem); _predToWords.put(morphItem.getWord().getStem(), surfaceWord); if (morphItem.isCoart()) { Word indexingWord = morphItem.getCoartIndexingWord(); _words.put(indexingWord, morphItem); Pair<String, String> first = indexingWord.getSurfaceAttrValPairs().next(); _indexedCoartAttrs.add(first.a); for (Iterator<Pair<String, String>> it = surfaceWord.getSurfaceAttrValPairs(); it.hasNext(); ) { Pair<String, String> p = it.next(); _coartAttrs.add(p.a); } } } // index entries based on stem+pos _stems = new GroupMap<String, Object>(); _posToEntries = new GroupMap<String, EntriesItem[]>(); // index entries by supertag+pos, for supertagging _stagToEntries = new GroupMap<String, EntriesItem>(); // also index rels and coart rels to preds _relsToPreds = new GroupMap<String, String>(); _coartRelsToPreds = new GroupMap<String, String>(); // and gather list of attributes used per atomic category type _catsToAttrs = new GroupMap<String, String>(); _lfAttrs = new HashSet<String>(); // and remember family and ent, names, for checking excluded list on morph items HashSet<String> familyAndEntryNames = new HashSet<String>(); // index each family for (Family family : lexicon) { familyAndEntryNames.add(family.getName()); EntriesItem[] entries = family.getEntries(); DataItem[] data = family.getData(); // for generic use when we get an unknown stem // from the morphological analyzer if (!family.isClosed()) { _posToEntries.put(family.getPOS(), entries); } // scan through entries for (int j = 0; j < entries.length; j++) { // index EntriesItem eItem = entries[j]; _stagToEntries.put(eItem.getSupertag() + family.getPOS(), eItem); if (eItem.getStem().length() > 0) { _stems.put(eItem.getStem() + family.getPOS(), eItem); } try { // gather features eItem.getCat().forall(gatherAttrs); // record names familyAndEntryNames.add(eItem.getName()); familyAndEntryNames.add(eItem.getQualifiedName()); } catch (RuntimeException exc) { System.err.println("exception for: " + family.getName() + ": " + exc); } } // scan through data for (int j = 0; j < data.length; j++) { DataItem dItem = data[j]; _stems.put( dItem.getStem() + family.getPOS(), new Pair<DataItem, EntriesItem[]>(dItem, entries)); // index non-default preds to words if (!dItem.getStem().equals(dItem.getPred())) { Collection<Word> words = (Collection<Word>) _predToWords.get(dItem.getStem()); if (words == null) { if (!openlex) { System.out.print("Warning: couldn't find words for pred '"); System.out.println(dItem.getPred() + "' with stem '" + dItem.getStem() + "'"); } } else { for (Iterator<Word> it = words.iterator(); it.hasNext(); ) { _predToWords.put(dItem.getPred(), it.next()); } } } } // index rels to preds // nb: this covers relational (eg @x<GenRel>e) and featural (eg @e<tense>past) // elementary predications List<String> indexRels = new ArrayList<String>(3); String familyIndexRel = family.getIndexRel(); if (familyIndexRel.length() > 0) { indexRels.add(familyIndexRel); } for (int j = 0; j < entries.length; j++) { EntriesItem eItem = entries[j]; String indexRel = eItem.getIndexRel(); if (indexRel.length() > 0 && !indexRel.equals(familyIndexRel)) { indexRels.add(indexRel); } } for (Iterator<String> it = indexRels.iterator(); it.hasNext(); ) { String indexRel = it.next(); // nb: not indexing on entries items, b/c some stems are still defaults for (int j = 0; j < data.length; j++) { DataItem dItem = data[j]; _relsToPreds.put(indexRel, dItem.getPred()); } } // index coart rels (features, really) to preds String coartRel = family.getCoartRel(); if (coartRel.length() > 0) { for (int j = 0; j < data.length; j++) { _coartRelsToPreds.put(coartRel, data[j].getPred()); } } } // index the macros _macros = new GroupMap<String, FeatureStructure>(); // nb: could just index MacroItem objects for feature structures too; // this might be a bit cleaner, but life is short _macroItems = new HashMap<String, MacroItem>(); for (MacroItem mi : macroModel) { String macName = mi.getName(); FeatureStructure[] specs = mi.getFeatureStructures(); for (int j = 0; j < specs.length; j++) { _macros.put(macName, specs[j]); } // this is for handling LF part of macros _macroItems.put(macName, mi); } // with morph items, check POS, macro names, excluded list for xref for (MorphItem morphItem : morph) { Word w = morphItem.getWord(); if (!openlex && !_stems.containsKey(w.getStem() + w.getPOS()) && !_posToEntries.containsKey(w.getPOS())) { System.err.println( "Warning: no entries for stem '" + w.getStem() + "' and POS '" + w.getPOS() + "' found for word '" + w + "'"); } String[] macroNames = morphItem.getMacros(); for (int j = 0; j < macroNames.length; j++) { if (!_macroItems.containsKey(macroNames[j])) { System.err.println( "Warning: macro " + macroNames[j] + " not found for word '" + morphItem.getWord() + "'"); } } String[] excludedNames = morphItem.getExcluded(); for (int j = 0; j < excludedNames.length; j++) { if (!familyAndEntryNames.contains(excludedNames[j])) { System.err.println( "Warning: excluded family or entry '" + excludedNames[j] + "' not found for word '" + morphItem.getWord() + "'"); } } } }