// look up and apply coarts for given rels to each sign in result private void applyCoarts(List<String> coartRels, Collection<Sign> result) { List<Sign> inputSigns = new ArrayList<Sign>(result); result.clear(); List<Sign> outputSigns = new ArrayList<Sign>(inputSigns.size()); // for each rel, lookup coarts and apply to input signs, storing results in output signs for (Iterator<String> it = coartRels.iterator(); it.hasNext(); ) { String rel = it.next(); Collection<String> preds = (Collection<String>) _coartRelsToPreds.get(rel); if (preds == null) continue; // not expected Collection<Sign> coartResult = getSignsFromRelAndPreds(rel, preds); if (coartResult == null) continue; for (Iterator<Sign> it2 = coartResult.iterator(); it2.hasNext(); ) { Sign coartSign = it2.next(); // apply to each input for (int j = 0; j < inputSigns.size(); j++) { Sign sign = inputSigns.get(j); grammar.rules.applyCoart(sign, coartSign, outputSigns); } } // switch output to input for next iteration inputSigns.clear(); inputSigns.addAll(outputSigns); outputSigns.clear(); } // add results back result.addAll(inputSigns); }
/** * For a string of 1 or more surface words, return all of the lexical entries for each word as a * list of sign hashes. Tokenization is performed using the configured tokenizer. * * @param w the words in string format * @return a list of sign hashes * @exception LexException thrown if word not found */ public List<SignHash> getEntriesFromWords(String s) throws LexException { List<SignHash> entries = new ArrayList<SignHash>(); List<Word> words = tokenizer.tokenize(s); for (Iterator<Word> it = words.iterator(); it.hasNext(); ) { Word w = it.next(); SignHash signs = getSignsFromWord(w); if (signs.size() == 0) { throw new LexException("Word not in lexicon: \"" + w + "\""); } entries.add(signs); } return entries; }
/** Loads the lexicon and morph files. */ public void init(URL lexiconUrl, URL morphUrl) throws IOException { List<Family> lexicon = null; List<MorphItem> morph = null; List<MacroItem> macroModel = null; // load category families (lexicon), morph forms and macros lexicon = getLexicon(lexiconUrl); Pair<List<MorphItem>, List<MacroItem>> morphInfo = getMorph(morphUrl); morph = morphInfo.a; macroModel = morphInfo.b; // index words; also index stems to words, as default preds // store indexed coarticulation attrs too _words = new GroupMap<Word, MorphItem>(); _predToWords = new GroupMap<String, Word>(); _coartAttrs = new HashSet<String>(); _indexedCoartAttrs = new HashSet<String>(); for (MorphItem morphItem : morph) { Word surfaceWord = morphItem.getSurfaceWord(); _words.put(surfaceWord, morphItem); _predToWords.put(morphItem.getWord().getStem(), surfaceWord); if (morphItem.isCoart()) { Word indexingWord = morphItem.getCoartIndexingWord(); _words.put(indexingWord, morphItem); Pair<String, String> first = indexingWord.getSurfaceAttrValPairs().next(); _indexedCoartAttrs.add(first.a); for (Iterator<Pair<String, String>> it = surfaceWord.getSurfaceAttrValPairs(); it.hasNext(); ) { Pair<String, String> p = it.next(); _coartAttrs.add(p.a); } } } // index entries based on stem+pos _stems = new GroupMap<String, Object>(); _posToEntries = new GroupMap<String, EntriesItem[]>(); // index entries by supertag+pos, for supertagging _stagToEntries = new GroupMap<String, EntriesItem>(); // also index rels and coart rels to preds _relsToPreds = new GroupMap<String, String>(); _coartRelsToPreds = new GroupMap<String, String>(); // and gather list of attributes used per atomic category type _catsToAttrs = new GroupMap<String, String>(); _lfAttrs = new HashSet<String>(); // and remember family and ent, names, for checking excluded list on morph items HashSet<String> familyAndEntryNames = new HashSet<String>(); // index each family for (Family family : lexicon) { familyAndEntryNames.add(family.getName()); EntriesItem[] entries = family.getEntries(); DataItem[] data = family.getData(); // for generic use when we get an unknown stem // from the morphological analyzer if (!family.isClosed()) { _posToEntries.put(family.getPOS(), entries); } // scan through entries for (int j = 0; j < entries.length; j++) { // index EntriesItem eItem = entries[j]; _stagToEntries.put(eItem.getSupertag() + family.getPOS(), eItem); if (eItem.getStem().length() > 0) { _stems.put(eItem.getStem() + family.getPOS(), eItem); } try { // gather features eItem.getCat().forall(gatherAttrs); // record names familyAndEntryNames.add(eItem.getName()); familyAndEntryNames.add(eItem.getQualifiedName()); } catch (RuntimeException exc) { System.err.println("exception for: " + family.getName() + ": " + exc); } } // scan through data for (int j = 0; j < data.length; j++) { DataItem dItem = data[j]; _stems.put( dItem.getStem() + family.getPOS(), new Pair<DataItem, EntriesItem[]>(dItem, entries)); // index non-default preds to words if (!dItem.getStem().equals(dItem.getPred())) { Collection<Word> words = (Collection<Word>) _predToWords.get(dItem.getStem()); if (words == null) { if (!openlex) { System.out.print("Warning: couldn't find words for pred '"); System.out.println(dItem.getPred() + "' with stem '" + dItem.getStem() + "'"); } } else { for (Iterator<Word> it = words.iterator(); it.hasNext(); ) { _predToWords.put(dItem.getPred(), it.next()); } } } } // index rels to preds // nb: this covers relational (eg @x<GenRel>e) and featural (eg @e<tense>past) // elementary predications List<String> indexRels = new ArrayList<String>(3); String familyIndexRel = family.getIndexRel(); if (familyIndexRel.length() > 0) { indexRels.add(familyIndexRel); } for (int j = 0; j < entries.length; j++) { EntriesItem eItem = entries[j]; String indexRel = eItem.getIndexRel(); if (indexRel.length() > 0 && !indexRel.equals(familyIndexRel)) { indexRels.add(indexRel); } } for (Iterator<String> it = indexRels.iterator(); it.hasNext(); ) { String indexRel = it.next(); // nb: not indexing on entries items, b/c some stems are still defaults for (int j = 0; j < data.length; j++) { DataItem dItem = data[j]; _relsToPreds.put(indexRel, dItem.getPred()); } } // index coart rels (features, really) to preds String coartRel = family.getCoartRel(); if (coartRel.length() > 0) { for (int j = 0; j < data.length; j++) { _coartRelsToPreds.put(coartRel, data[j].getPred()); } } } // index the macros _macros = new GroupMap<String, FeatureStructure>(); // nb: could just index MacroItem objects for feature structures too; // this might be a bit cleaner, but life is short _macroItems = new HashMap<String, MacroItem>(); for (MacroItem mi : macroModel) { String macName = mi.getName(); FeatureStructure[] specs = mi.getFeatureStructures(); for (int j = 0; j < specs.length; j++) { _macros.put(macName, specs[j]); } // this is for handling LF part of macros _macroItems.put(macName, mi); } // with morph items, check POS, macro names, excluded list for xref for (MorphItem morphItem : morph) { Word w = morphItem.getWord(); if (!openlex && !_stems.containsKey(w.getStem() + w.getPOS()) && !_posToEntries.containsKey(w.getPOS())) { System.err.println( "Warning: no entries for stem '" + w.getStem() + "' and POS '" + w.getPOS() + "' found for word '" + w + "'"); } String[] macroNames = morphItem.getMacros(); for (int j = 0; j < macroNames.length; j++) { if (!_macroItems.containsKey(macroNames[j])) { System.err.println( "Warning: macro " + macroNames[j] + " not found for word '" + morphItem.getWord() + "'"); } } String[] excludedNames = morphItem.getExcluded(); for (int j = 0; j < excludedNames.length; j++) { if (!familyAndEntryNames.contains(excludedNames[j])) { System.err.println( "Warning: excluded family or entry '" + excludedNames[j] + "' not found for word '" + morphItem.getWord() + "'"); } } } }