/** Loads the lexicon and morph files. */ public void init(URL lexiconUrl, URL morphUrl) throws IOException { List<Family> lexicon = null; List<MorphItem> morph = null; List<MacroItem> macroModel = null; // load category families (lexicon), morph forms and macros lexicon = getLexicon(lexiconUrl); Pair<List<MorphItem>, List<MacroItem>> morphInfo = getMorph(morphUrl); morph = morphInfo.a; macroModel = morphInfo.b; // index words; also index stems to words, as default preds // store indexed coarticulation attrs too _words = new GroupMap<Word, MorphItem>(); _predToWords = new GroupMap<String, Word>(); _coartAttrs = new HashSet<String>(); _indexedCoartAttrs = new HashSet<String>(); for (MorphItem morphItem : morph) { Word surfaceWord = morphItem.getSurfaceWord(); _words.put(surfaceWord, morphItem); _predToWords.put(morphItem.getWord().getStem(), surfaceWord); if (morphItem.isCoart()) { Word indexingWord = morphItem.getCoartIndexingWord(); _words.put(indexingWord, morphItem); Pair<String, String> first = indexingWord.getSurfaceAttrValPairs().next(); _indexedCoartAttrs.add(first.a); for (Iterator<Pair<String, String>> it = surfaceWord.getSurfaceAttrValPairs(); it.hasNext(); ) { Pair<String, String> p = it.next(); _coartAttrs.add(p.a); } } } // index entries based on stem+pos _stems = new GroupMap<String, Object>(); _posToEntries = new GroupMap<String, EntriesItem[]>(); // index entries by supertag+pos, for supertagging _stagToEntries = new GroupMap<String, EntriesItem>(); // also index rels and coart rels to preds _relsToPreds = new GroupMap<String, String>(); _coartRelsToPreds = new GroupMap<String, String>(); // and gather list of attributes used per atomic category type _catsToAttrs = new GroupMap<String, String>(); _lfAttrs = new HashSet<String>(); // and remember family and ent, names, for checking excluded list on morph items HashSet<String> familyAndEntryNames = new HashSet<String>(); // index each family for (Family family : lexicon) { familyAndEntryNames.add(family.getName()); EntriesItem[] entries = family.getEntries(); DataItem[] data = family.getData(); // for generic use when we get an unknown stem // from the morphological analyzer if (!family.isClosed()) { _posToEntries.put(family.getPOS(), entries); } // scan through entries for (int j = 0; j < entries.length; j++) { // index EntriesItem eItem = entries[j]; _stagToEntries.put(eItem.getSupertag() + family.getPOS(), eItem); if (eItem.getStem().length() > 0) { _stems.put(eItem.getStem() + family.getPOS(), eItem); } try { // gather features eItem.getCat().forall(gatherAttrs); // record names familyAndEntryNames.add(eItem.getName()); familyAndEntryNames.add(eItem.getQualifiedName()); } catch (RuntimeException exc) { System.err.println("exception for: " + family.getName() + ": " + exc); } } // scan through data for (int j = 0; j < data.length; j++) { DataItem dItem = data[j]; _stems.put( dItem.getStem() + family.getPOS(), new Pair<DataItem, EntriesItem[]>(dItem, entries)); // index non-default preds to words if (!dItem.getStem().equals(dItem.getPred())) { Collection<Word> words = (Collection<Word>) _predToWords.get(dItem.getStem()); if (words == null) { if (!openlex) { System.out.print("Warning: couldn't find words for pred '"); System.out.println(dItem.getPred() + "' with stem '" + dItem.getStem() + "'"); } } else { for (Iterator<Word> it = words.iterator(); it.hasNext(); ) { _predToWords.put(dItem.getPred(), it.next()); } } } } // index rels to preds // nb: this covers relational (eg @x<GenRel>e) and featural (eg @e<tense>past) // elementary predications List<String> indexRels = new ArrayList<String>(3); String familyIndexRel = family.getIndexRel(); if (familyIndexRel.length() > 0) { indexRels.add(familyIndexRel); } for (int j = 0; j < entries.length; j++) { EntriesItem eItem = entries[j]; String indexRel = eItem.getIndexRel(); if (indexRel.length() > 0 && !indexRel.equals(familyIndexRel)) { indexRels.add(indexRel); } } for (Iterator<String> it = indexRels.iterator(); it.hasNext(); ) { String indexRel = it.next(); // nb: not indexing on entries items, b/c some stems are still defaults for (int j = 0; j < data.length; j++) { DataItem dItem = data[j]; _relsToPreds.put(indexRel, dItem.getPred()); } } // index coart rels (features, really) to preds String coartRel = family.getCoartRel(); if (coartRel.length() > 0) { for (int j = 0; j < data.length; j++) { _coartRelsToPreds.put(coartRel, data[j].getPred()); } } } // index the macros _macros = new GroupMap<String, FeatureStructure>(); // nb: could just index MacroItem objects for feature structures too; // this might be a bit cleaner, but life is short _macroItems = new HashMap<String, MacroItem>(); for (MacroItem mi : macroModel) { String macName = mi.getName(); FeatureStructure[] specs = mi.getFeatureStructures(); for (int j = 0; j < specs.length; j++) { _macros.put(macName, specs[j]); } // this is for handling LF part of macros _macroItems.put(macName, mi); } // with morph items, check POS, macro names, excluded list for xref for (MorphItem morphItem : morph) { Word w = morphItem.getWord(); if (!openlex && !_stems.containsKey(w.getStem() + w.getPOS()) && !_posToEntries.containsKey(w.getPOS())) { System.err.println( "Warning: no entries for stem '" + w.getStem() + "' and POS '" + w.getPOS() + "' found for word '" + w + "'"); } String[] macroNames = morphItem.getMacros(); for (int j = 0; j < macroNames.length; j++) { if (!_macroItems.containsKey(macroNames[j])) { System.err.println( "Warning: macro " + macroNames[j] + " not found for word '" + morphItem.getWord() + "'"); } } String[] excludedNames = morphItem.getExcluded(); for (int j = 0; j < excludedNames.length; j++) { if (!familyAndEntryNames.contains(excludedNames[j])) { System.err.println( "Warning: excluded family or entry '" + excludedNames[j] + "' not found for word '" + morphItem.getWord() + "'"); } } } }
// given MorphItem private void getWithMorphItem( Word w, MorphItem mi, String targetPred, String targetRel, SignHash result) throws LexException { // get supertags for filtering, if a supertagger is installed Map<String, Double> supertags = null; Set<String> supertagsFound = null; if (_supertagger != null) { supertags = _supertagger.getSupertags(); if (supertags != null) supertagsFound = new HashSet<String>(supertags.size()); } // get macro adder MacroAdder macAdder = getMacAdder(mi); // if we have this stem in our lexicon String stem = mi.getWord().getStem(); String pos = mi.getWord().getPOS(); Set<EntriesItem[]> explicitEntries = null; // for storing entries from explicitly listed family members if (_stems.containsKey(stem + pos)) { explicitEntries = new HashSet<EntriesItem[]>(); Collection<Object> stemItems = (Collection<Object>) _stems.get(stem + pos); for (Iterator<Object> I = stemItems.iterator(); I.hasNext(); ) { Object item = I.next(); // see if it's an EntriesItem if (item instanceof EntriesItem) { EntriesItem entry = (EntriesItem) item; // do lookup getWithEntriesItem( w, mi, stem, stem, targetPred, targetRel, entry, macAdder, supertags, supertagsFound, result); } // otherwise it has to be a Pair containing a DataItem and // an EntriesItem[] else { @SuppressWarnings("rawtypes") DataItem dItem = (DataItem) ((Pair) item).a; @SuppressWarnings("rawtypes") EntriesItem[] entries = (EntriesItem[]) ((Pair) item).b; // store entries explicitEntries.add(entries); // do lookup getWithDataItem( w, mi, dItem, entries, targetPred, targetRel, macAdder, supertags, supertagsFound, result); } } } // for entries that are not explicitly in the lexicon file, we have to create // Signs from the open class entries with the appropriate part-of-speech Collection<EntriesItem[]> entrySets = (Collection<EntriesItem[]>) _posToEntries.get(pos); if (entrySets != null) { for (Iterator<EntriesItem[]> E = entrySets.iterator(); E.hasNext(); ) { EntriesItem[] entries = E.next(); // skip if entries explicitly listed if (explicitEntries != null && explicitEntries.contains(entries)) continue; // otherwise get entries with pred = targetPred, or stem if null String pred = (targetPred != null) ? targetPred : stem; getWithDataItem( w, mi, new DataItem(stem, pred), entries, targetPred, targetRel, macAdder, supertags, supertagsFound, result); } } // finally do entries for any remaining supertags if (supertags != null) { for (String supertag : supertags.keySet()) { if (supertagsFound.contains(supertag)) continue; Set<EntriesItem> entries = _stagToEntries.get(supertag + pos); if (entries == null) continue; // nb: could be a POS mismatch // get entries with pred = targetPred, or stem if null String pred = (targetPred != null) ? targetPred : stem; for (EntriesItem entry : entries) { if (!entry.getStem().equals(DEFAULT_VAL)) continue; getWithEntriesItem( w, mi, stem, pred, targetPred, targetRel, entry, macAdder, supertags, supertagsFound, result); } } } }
/** * Returns whether the given rel (semantic feature, really) is one used to signal coarticulation. */ public boolean isCoartRel(String rel) { return _coartRelsToPreds.containsKey(rel); }