Beispiel #1
0
  /** Loads the lexicon and morph files. */
  public void init(URL lexiconUrl, URL morphUrl) throws IOException {

    List<Family> lexicon = null;
    List<MorphItem> morph = null;
    List<MacroItem> macroModel = null;

    // load category families (lexicon), morph forms and macros
    lexicon = getLexicon(lexiconUrl);
    Pair<List<MorphItem>, List<MacroItem>> morphInfo = getMorph(morphUrl);
    morph = morphInfo.a;
    macroModel = morphInfo.b;

    // index words; also index stems to words, as default preds
    // store indexed coarticulation attrs too
    _words = new GroupMap<Word, MorphItem>();
    _predToWords = new GroupMap<String, Word>();
    _coartAttrs = new HashSet<String>();
    _indexedCoartAttrs = new HashSet<String>();
    for (MorphItem morphItem : morph) {
      Word surfaceWord = morphItem.getSurfaceWord();
      _words.put(surfaceWord, morphItem);
      _predToWords.put(morphItem.getWord().getStem(), surfaceWord);
      if (morphItem.isCoart()) {
        Word indexingWord = morphItem.getCoartIndexingWord();
        _words.put(indexingWord, morphItem);
        Pair<String, String> first = indexingWord.getSurfaceAttrValPairs().next();
        _indexedCoartAttrs.add(first.a);
        for (Iterator<Pair<String, String>> it = surfaceWord.getSurfaceAttrValPairs();
            it.hasNext(); ) {
          Pair<String, String> p = it.next();
          _coartAttrs.add(p.a);
        }
      }
    }

    // index entries based on stem+pos
    _stems = new GroupMap<String, Object>();
    _posToEntries = new GroupMap<String, EntriesItem[]>();
    // index entries by supertag+pos, for supertagging
    _stagToEntries = new GroupMap<String, EntriesItem>();
    // also index rels and coart rels to preds
    _relsToPreds = new GroupMap<String, String>();
    _coartRelsToPreds = new GroupMap<String, String>();
    // and gather list of attributes used per atomic category type
    _catsToAttrs = new GroupMap<String, String>();
    _lfAttrs = new HashSet<String>();
    // and remember family and ent, names, for checking excluded list on morph items
    HashSet<String> familyAndEntryNames = new HashSet<String>();

    // index each family
    for (Family family : lexicon) {

      familyAndEntryNames.add(family.getName());
      EntriesItem[] entries = family.getEntries();
      DataItem[] data = family.getData();

      // for generic use when we get an unknown stem
      // from the morphological analyzer
      if (!family.isClosed()) {
        _posToEntries.put(family.getPOS(), entries);
      }

      // scan through entries
      for (int j = 0; j < entries.length; j++) {
        // index
        EntriesItem eItem = entries[j];
        _stagToEntries.put(eItem.getSupertag() + family.getPOS(), eItem);
        if (eItem.getStem().length() > 0) {
          _stems.put(eItem.getStem() + family.getPOS(), eItem);
        }
        try {
          // gather features
          eItem.getCat().forall(gatherAttrs);
          // record names
          familyAndEntryNames.add(eItem.getName());
          familyAndEntryNames.add(eItem.getQualifiedName());
        } catch (RuntimeException exc) {
          System.err.println("exception for: " + family.getName() + ": " + exc);
        }
      }

      // scan through data
      for (int j = 0; j < data.length; j++) {
        DataItem dItem = data[j];
        _stems.put(
            dItem.getStem() + family.getPOS(), new Pair<DataItem, EntriesItem[]>(dItem, entries));
        // index non-default preds to words
        if (!dItem.getStem().equals(dItem.getPred())) {
          Collection<Word> words = (Collection<Word>) _predToWords.get(dItem.getStem());
          if (words == null) {
            if (!openlex) {
              System.out.print("Warning: couldn't find words for pred '");
              System.out.println(dItem.getPred() + "' with stem '" + dItem.getStem() + "'");
            }
          } else {
            for (Iterator<Word> it = words.iterator(); it.hasNext(); ) {
              _predToWords.put(dItem.getPred(), it.next());
            }
          }
        }
      }

      // index rels to preds
      // nb: this covers relational (eg @x<GenRel>e) and featural (eg @e<tense>past)
      //     elementary predications
      List<String> indexRels = new ArrayList<String>(3);
      String familyIndexRel = family.getIndexRel();
      if (familyIndexRel.length() > 0) {
        indexRels.add(familyIndexRel);
      }
      for (int j = 0; j < entries.length; j++) {
        EntriesItem eItem = entries[j];
        String indexRel = eItem.getIndexRel();
        if (indexRel.length() > 0 && !indexRel.equals(familyIndexRel)) {
          indexRels.add(indexRel);
        }
      }
      for (Iterator<String> it = indexRels.iterator(); it.hasNext(); ) {
        String indexRel = it.next();
        // nb: not indexing on entries items, b/c some stems are still defaults
        for (int j = 0; j < data.length; j++) {
          DataItem dItem = data[j];
          _relsToPreds.put(indexRel, dItem.getPred());
        }
      }

      // index coart rels (features, really) to preds
      String coartRel = family.getCoartRel();
      if (coartRel.length() > 0) {
        for (int j = 0; j < data.length; j++) {
          _coartRelsToPreds.put(coartRel, data[j].getPred());
        }
      }
    }

    // index the macros
    _macros = new GroupMap<String, FeatureStructure>();
    // nb: could just index MacroItem objects for feature structures too;
    //     this might be a bit cleaner, but life is short
    _macroItems = new HashMap<String, MacroItem>();
    for (MacroItem mi : macroModel) {
      String macName = mi.getName();
      FeatureStructure[] specs = mi.getFeatureStructures();
      for (int j = 0; j < specs.length; j++) {
        _macros.put(macName, specs[j]);
      }
      // this is for handling LF part of macros
      _macroItems.put(macName, mi);
    }

    // with morph items, check POS, macro names, excluded list for xref
    for (MorphItem morphItem : morph) {
      Word w = morphItem.getWord();
      if (!openlex
          && !_stems.containsKey(w.getStem() + w.getPOS())
          && !_posToEntries.containsKey(w.getPOS())) {
        System.err.println(
            "Warning: no entries for stem '"
                + w.getStem()
                + "' and POS '"
                + w.getPOS()
                + "' found for word '"
                + w
                + "'");
      }
      String[] macroNames = morphItem.getMacros();
      for (int j = 0; j < macroNames.length; j++) {
        if (!_macroItems.containsKey(macroNames[j])) {
          System.err.println(
              "Warning: macro "
                  + macroNames[j]
                  + " not found for word '"
                  + morphItem.getWord()
                  + "'");
        }
      }
      String[] excludedNames = morphItem.getExcluded();
      for (int j = 0; j < excludedNames.length; j++) {
        if (!familyAndEntryNames.contains(excludedNames[j])) {
          System.err.println(
              "Warning: excluded family or entry '"
                  + excludedNames[j]
                  + "' not found for word '"
                  + morphItem.getWord()
                  + "'");
        }
      }
    }
  }
Beispiel #2
0
  // given MorphItem
  private void getWithMorphItem(
      Word w, MorphItem mi, String targetPred, String targetRel, SignHash result)
      throws LexException {
    // get supertags for filtering, if a supertagger is installed
    Map<String, Double> supertags = null;
    Set<String> supertagsFound = null;
    if (_supertagger != null) {
      supertags = _supertagger.getSupertags();
      if (supertags != null) supertagsFound = new HashSet<String>(supertags.size());
    }

    // get macro adder
    MacroAdder macAdder = getMacAdder(mi);

    // if we have this stem in our lexicon
    String stem = mi.getWord().getStem();
    String pos = mi.getWord().getPOS();
    Set<EntriesItem[]> explicitEntries =
        null; // for storing entries from explicitly listed family members
    if (_stems.containsKey(stem + pos)) {
      explicitEntries = new HashSet<EntriesItem[]>();
      Collection<Object> stemItems = (Collection<Object>) _stems.get(stem + pos);
      for (Iterator<Object> I = stemItems.iterator(); I.hasNext(); ) {
        Object item = I.next();
        // see if it's an EntriesItem
        if (item instanceof EntriesItem) {
          EntriesItem entry = (EntriesItem) item;
          // do lookup
          getWithEntriesItem(
              w,
              mi,
              stem,
              stem,
              targetPred,
              targetRel,
              entry,
              macAdder,
              supertags,
              supertagsFound,
              result);
        }
        // otherwise it has to be a Pair containing a DataItem and
        // an EntriesItem[]
        else {
          @SuppressWarnings("rawtypes")
          DataItem dItem = (DataItem) ((Pair) item).a;
          @SuppressWarnings("rawtypes")
          EntriesItem[] entries = (EntriesItem[]) ((Pair) item).b;
          // store entries
          explicitEntries.add(entries);
          // do lookup
          getWithDataItem(
              w,
              mi,
              dItem,
              entries,
              targetPred,
              targetRel,
              macAdder,
              supertags,
              supertagsFound,
              result);
        }
      }
    }

    // for entries that are not explicitly in the lexicon file, we have to create
    // Signs from the open class entries with the appropriate part-of-speech
    Collection<EntriesItem[]> entrySets = (Collection<EntriesItem[]>) _posToEntries.get(pos);
    if (entrySets != null) {
      for (Iterator<EntriesItem[]> E = entrySets.iterator(); E.hasNext(); ) {
        EntriesItem[] entries = E.next();
        // skip if entries explicitly listed
        if (explicitEntries != null && explicitEntries.contains(entries)) continue;
        // otherwise get entries with pred = targetPred, or stem if null
        String pred = (targetPred != null) ? targetPred : stem;
        getWithDataItem(
            w,
            mi,
            new DataItem(stem, pred),
            entries,
            targetPred,
            targetRel,
            macAdder,
            supertags,
            supertagsFound,
            result);
      }
    }

    // finally do entries for any remaining supertags
    if (supertags != null) {
      for (String supertag : supertags.keySet()) {
        if (supertagsFound.contains(supertag)) continue;
        Set<EntriesItem> entries = _stagToEntries.get(supertag + pos);
        if (entries == null) continue; // nb: could be a POS mismatch
        // get entries with pred = targetPred, or stem if null
        String pred = (targetPred != null) ? targetPred : stem;
        for (EntriesItem entry : entries) {
          if (!entry.getStem().equals(DEFAULT_VAL)) continue;
          getWithEntriesItem(
              w,
              mi,
              stem,
              pred,
              targetPred,
              targetRel,
              entry,
              macAdder,
              supertags,
              supertagsFound,
              result);
        }
      }
    }
  }
Beispiel #3
0
 /**
  * Returns whether the given rel (semantic feature, really) is one used to signal coarticulation.
  */
 public boolean isCoartRel(String rel) {
   return _coartRelsToPreds.containsKey(rel);
 }