// get signs with additional args for a known special token const, target pred and target rel
private SignHash getSignsFromWord(Word w, String specialTokenConst, String targetPred, String targetRel) throws LexException {
    Collection<MorphItem> morphItems = (specialTokenConst == null)
        ? (Collection<MorphItem>) _words.get(w)
        : null;
    if (morphItems == null) {
        // check for special tokens
        if (specialTokenConst == null) {
            specialTokenConst = tokenizer.getSpecialTokenConstant(tokenizer.isSpecialToken(w.getForm()));
            targetPred = w.getForm();
        }
        if (specialTokenConst != null) {
            Word key = Word.createSurfaceWord(w, specialTokenConst);
            morphItems = (Collection<MorphItem>) _words.get(key);
        }
        // otherwise throw lex exception
        if (morphItems == null)
            throw new LexException(w + " not in lexicon");
    }
    SignHash result = new SignHash();
    for (Iterator<MorphItem> MI = morphItems.iterator(); MI.hasNext(); ) {
        getWithMorphItem(w, MI.next(), targetPred, targetRel, result);
    }
    return result;
}
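// getEntriesFromWords below calls a one-argument form of getSignsFromWord. A minimal
// sketch of such an overload, assuming it simply delegates to the method above with no
// special token constant, target pred or target rel (the actual implementation may do more):
public SignHash getSignsFromWord(Word w) throws LexException {
    // no special token constant or target pred/rel is known for a plain surface lookup
    return getSignsFromWord(w, null, null, null);
}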
// get signs using an additional arg for a target rel
private Collection<Sign> getSignsFromPredAndTargetRel(String pred, String targetRel) {
    Collection<Word> words = (Collection<Word>) _predToWords.get(pred);
    String specialTokenConst = null;

    // for robustness, when using supertagger, add words for pred sans sense index
    int dotIndex = -1;
    if (_supertagger != null
        && !Character.isDigit(pred.charAt(0))            // skip numbers
        && (dotIndex = pred.lastIndexOf('.')) > 0
        && pred.length() > dotIndex + 1
        && pred.charAt(dotIndex + 1) != '_')             // skip titles, eg Mr._Smith
    {
        String barePred = pred.substring(0, dotIndex);
        Collection<Word> barePredWords = (Collection<Word>) _predToWords.get(barePred);
        if (words == null) words = barePredWords;
        else if (barePredWords != null) {
            Set<Word> unionWords = new HashSet<Word>(words);
            unionWords.addAll(barePredWords);
            words = unionWords;
        }
    }

    if (words == null) {
        specialTokenConst = tokenizer.getSpecialTokenConstant(tokenizer.isSpecialToken(pred));
        if (specialTokenConst == null) return null;
        // lookup words with pred = special token const
        Collection<Word> specialTokenWords = (Collection<Word>) _predToWords.get(specialTokenConst);
        if (specialTokenWords == null) return null;
        // replace special token const with pred
        words = new ArrayList<Word>(specialTokenWords.size());
        for (Iterator<Word> it = specialTokenWords.iterator(); it.hasNext(); ) {
            Word stw = it.next();
            Word w = Word.createSurfaceWord(stw, pred);
            words.add(w);
        }
    }

    List<Sign> retval = new ArrayList<Sign>();
    for (Iterator<Word> it = words.iterator(); it.hasNext(); ) {
        Word w = it.next();
        try {
            SignHash signs = getSignsFromWord(w, specialTokenConst, pred, targetRel);
            retval.addAll(signs.asSignSet());
        }
        // shouldn't happen
        catch (LexException exc) {
            System.err.println("Unexpected lex exception for word " + w + ": " + exc);
        }
    }
    return retval;
}
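// Illustration of the sense-index fallback above as a standalone helper (hypothetical,
// not part of the original class): "bank.2" yields the bare pred "bank", while "3.14"
// and "Mr._Smith" are left untouched by the same guards used in getSignsFromPredAndTargetRel.
private static String bareSensePred(String pred) {
    int dotIndex = pred.lastIndexOf('.');
    if (!Character.isDigit(pred.charAt(0))               // skip numbers, eg 3.14
        && dotIndex > 0
        && pred.length() > dotIndex + 1
        && pred.charAt(dotIndex + 1) != '_')             // skip titles, eg Mr._Smith
    {
        return pred.substring(0, dotIndex);
    }
    return pred;
}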
/**
 * For a string of one or more surface words, returns all of the lexical entries
 * for each word as a list of sign hashes. Tokenization is performed using the
 * configured tokenizer.
 *
 * @param s the surface string containing the words
 * @return a list of sign hashes, one per word
 * @exception LexException thrown if a word is not found in the lexicon
 */
public List<SignHash> getEntriesFromWords(String s) throws LexException {
    List<SignHash> entries = new ArrayList<SignHash>();
    List<Word> words = tokenizer.tokenize(s);
    for (Iterator<Word> it = words.iterator(); it.hasNext(); ) {
        Word w = it.next();
        SignHash signs = getSignsFromWord(w);
        if (signs.size() == 0) {
            throw new LexException("Word not in lexicon: \"" + w + "\"");
        }
        entries.add(signs);
    }
    return entries;
}
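// A minimal usage sketch of getEntriesFromWords (hypothetical method, assuming a loaded
// lexicon instance and a string that tokenizes into known words); each SignHash holds
// the lexical signs for one surface word:
public void printEntries(String s) throws LexException {
    for (SignHash wordEntries : getEntriesFromWords(s)) {
        for (Sign sign : wordEntries.asSignSet()) {
            System.out.println(sign);
        }
    }
}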