コード例 #1
0
ファイル: Lexicon.java プロジェクト: kevinkissi/openccg
  /**
   * Returns the macro adder for the given morph item, building it on first request and caching it
   * in {@code macAdderMap} for later calls.
   *
   * <p>For every macro name listed by the morph item, the feature structures registered under
   * that name are collected by their index, and the matching macro item is collected as well. A
   * name with no macro item only produces a warning on standard error.
   *
   * @param mi the morph item whose macros are to be gathered
   * @return the cached or newly created macro adder
   */
  private MacroAdder getMacAdder(MorphItem mi) {
    // reuse the adder built earlier for this morph item, if any
    MacroAdder cached = macAdderMap.get(mi);
    if (cached != null) {
      return cached;
    }

    IntHashSetMap fsByIndex = new IntHashSetMap();
    List<MacroItem> foundItems = new ArrayList<MacroItem>();
    for (String macroName : mi.getMacros()) {
      // feature structures contributed by this macro, keyed by their index
      Set<FeatureStructure> featStrucs = (Set<FeatureStructure>) _macros.get(macroName);
      if (featStrucs != null) {
        for (FeatureStructure fs : featStrucs) {
          fsByIndex.put(fs.getIndex(), fs);
        }
      }
      MacroItem item = _macroItems.get(macroName);
      if (item == null) {
        // should be checked earlier too
        System.err.println(
            "Warning: macro " + macroName + " not found for word '" + mi.getWord() + "'");
        continue;
      }
      foundItems.add(item);
    }

    // remember the new adder before handing it out
    MacroAdder adder = new MacroAdder(fsByIndex, foundItems);
    macAdderMap.put(mi, adder);
    return adder;
  }
コード例 #2
0
ファイル: Lexicon.java プロジェクト: kevinkissi/openccg
 /**
  * Looks up the coarticulation signs for each of the given rels and applies them, one rel after
  * another, to the signs in {@code result}. On return {@code result} holds the signs produced by
  * the last round instead of its original contents.
  *
  * <p>A rel with no indexed preds, or whose preds yield no signs, is skipped and leaves the
  * current signs untouched.
  *
  * @param coartRels the rels whose coarticulations should be applied, in order
  * @param result the signs to transform; cleared and refilled in place
  */
 private void applyCoarts(List<String> coartRels, Collection<Sign> result) {
   List<Sign> current = new ArrayList<Sign>(result);
   result.clear();
   List<Sign> next = new ArrayList<Sign>(current.size());
   for (String rel : coartRels) {
     Collection<String> preds = (Collection<String>) _coartRelsToPreds.get(rel);
     if (preds == null) {
       continue; // not expected
     }
     Collection<Sign> coartSigns = getSignsFromRelAndPreds(rel, preds);
     if (coartSigns == null) {
       continue;
     }
     // pair every coart sign with every current sign; applyCoart receives next as its sink
     for (Sign coartSign : coartSigns) {
       for (Sign sign : current) {
         grammar.rules.applyCoart(sign, coartSign, next);
       }
     }
     // the output of this round is the input of the following one
     current.clear();
     current.addAll(next);
     next.clear();
   }
   // hand the final round back to the caller
   result.addAll(current);
 }
コード例 #3
0
ファイル: Lexicon.java プロジェクト: kevinkissi/openccg
 /**
  * Gathers the signs found for each of the given preds with {@code rel} as the target rel.
  *
  * @param rel the target rel passed along to the per-pred lookup
  * @param preds the preds to look up
  * @return all signs found, or {@code null} if there are none
  */
 private Collection<Sign> getSignsFromRelAndPreds(String rel, Collection<String> preds) {
   List<Sign> collected = new ArrayList<Sign>();
   for (String pred : preds) {
     Collection<Sign> signsForPred = getSignsFromPredAndTargetRel(pred, rel);
     if (signsForPred != null) {
       collected.addAll(signsForPred);
     }
   }
   // an empty outcome is reported as null rather than as an empty collection
   return collected.isEmpty() ? null : collected;
 }
コード例 #4
0
ファイル: Lexicon.java プロジェクト: kevinkissi/openccg
  /**
   * Returns the signs for the words indexed under the given pred, passing {@code targetRel}
   * through to the per-word lookup.
   *
   * <p>When a supertagger is configured, words indexed under the pred with its final
   * {@code .suffix} removed are included as well. If no words are found at all, the pred is tried
   * as a special token: the words indexed under its special token constant are re-created as
   * surface words for the pred.
   *
   * @param pred the predicate to look up
   * @param targetRel the target rel handed to {@code getSignsFromWord}
   * @return the signs found (possibly empty), or {@code null} if no words could be found for the
   *     pred, directly or via a special token constant
   */
  private Collection<Sign> getSignsFromPredAndTargetRel(String pred, String targetRel) {
    Collection<Word> words = (Collection<Word>) _predToWords.get(pred);

    // for robustness, when using supertagger, add words for pred sans sense index
    // (preds starting with a digit, ie numbers, are skipped)
    if (_supertagger != null && !Character.isDigit(pred.charAt(0))) {
      int dotIndex = pred.lastIndexOf('.');
      boolean hasSenseSuffix =
          dotIndex > 0
              && pred.length() > dotIndex + 1
              && pred.charAt(dotIndex + 1) != '_'; // skip titles, eg Mr._Smith
      if (hasSenseSuffix) {
        String barePred = pred.substring(0, dotIndex);
        Collection<Word> barePredWords = (Collection<Word>) _predToWords.get(barePred);
        if (words == null) {
          words = barePredWords;
        } else if (barePredWords != null) {
          // both lookups succeeded: use the union of the two word sets
          Set<Word> merged = new HashSet<Word>(words);
          merged.addAll(barePredWords);
          words = merged;
        }
      }
    }

    String specialTokenConst = null;
    if (words == null) {
      specialTokenConst = tokenizer.getSpecialTokenConstant(tokenizer.isSpecialToken(pred));
      if (specialTokenConst == null) {
        return null;
      }
      // lookup words with pred = special token const
      Collection<Word> specialTokenWords = (Collection<Word>) _predToWords.get(specialTokenConst);
      if (specialTokenWords == null) {
        return null;
      }
      // replace special token const with pred
      words = new ArrayList<Word>(specialTokenWords.size());
      for (Word stw : specialTokenWords) {
        words.add(Word.createSurfaceWord(stw, pred));
      }
    }

    List<Sign> found = new ArrayList<Sign>();
    for (Word w : words) {
      try {
        SignHash signs = getSignsFromWord(w, specialTokenConst, pred, targetRel);
        found.addAll(signs.asSignSet());
      } catch (LexException exc) {
        // shouldn't happen
        System.err.println("Unexpected lex exception for word " + w + ": " + exc);
      }
    }
    return found;
  }
コード例 #5
0
ファイル: Lexicon.java プロジェクト: kevinkissi/openccg
 /**
  * Builds {@code _licensingFeatures} from the {@code feat} children of the given element,
  * filling in defaults for attributes that are not specified.
  *
  * <p>Defaults are: license empty cats, do not license marked cats, instantiate, location both.
  * When {@code license-marked-cats} is given, the defaults for the remaining options change to:
  * do not license empty cats, do not instantiate, location target-only. Explicit
  * {@code license-empty-cats}, {@code instantiate} and {@code location} attributes override these
  * defaults; an unrecognized {@code location} value leaves the default in place.
  *
  * <p>If no feature with attr {@code lex} is present (or the element is null),
  * {@code LicensingFeature.defaultLexFeature} is appended.
  *
  * @param licensingElt the licensing features element, or null if there is none
  */
 @SuppressWarnings("unchecked")
 private void loadLicensingFeatures(Element licensingElt) {
   List<LicensingFeature> feats = new ArrayList<LicensingFeature>();
   boolean sawLexFeat = false;
   if (licensingElt != null) {
     List<Element> featElts = licensingElt.getChildren("feat");
     for (Element featElt : featElts) {
       String attr = featElt.getAttributeValue("attr");
       if (attr.equals("lex")) {
         sawLexFeat = true;
       }
       String val = featElt.getAttributeValue("val");

       // whitespace-separated list of additional licensing values, if given
       String alsoVals = featElt.getAttributeValue("also-licensed-by");
       List<String> alsoLicensedBy =
           (alsoVals == null) ? null : Arrays.asList(alsoVals.split("\\s+"));

       // baseline defaults
       boolean licenseEmptyCats = true;
       boolean licenseMarkedCats = false;
       boolean instantiate = true;
       byte loc = LicensingFeature.BOTH;

       String lmc = featElt.getAttributeValue("license-marked-cats");
       if (lmc != null) {
         licenseMarkedCats = Boolean.parseBoolean(lmc);
         // change defaults
         licenseEmptyCats = false;
         loc = LicensingFeature.TARGET_ONLY;
         instantiate = false;
       }

       // explicit settings win over either set of defaults
       String lec = featElt.getAttributeValue("license-empty-cats");
       if (lec != null) {
         licenseEmptyCats = Boolean.parseBoolean(lec);
       }
       String inst = featElt.getAttributeValue("instantiate");
       if (inst != null) {
         instantiate = Boolean.parseBoolean(inst);
       }
       String locStr = featElt.getAttributeValue("location");
       if (locStr != null) {
         if (locStr.equals("target-only")) {
           loc = LicensingFeature.TARGET_ONLY;
         } else if (locStr.equals("args-only")) {
           loc = LicensingFeature.ARGS_ONLY;
         } else if (locStr.equals("both")) {
           loc = LicensingFeature.BOTH;
         }
       }

       LicensingFeature feat =
           new LicensingFeature(
               attr, val, alsoLicensedBy, licenseEmptyCats, licenseMarkedCats, instantiate, loc);
       feats.add(feat);
     }
   }
   // make sure there is always a licensing feature for lex
   if (!sawLexFeat) {
     feats.add(LicensingFeature.defaultLexFeature);
   }
   _licensingFeatures = feats.toArray(new LicensingFeature[feats.size()]);
 }
コード例 #6
0
ファイル: Lexicon.java プロジェクト: kevinkissi/openccg
 /**
  * For a string of one or more surface words, returns the lexical entries of each word as a list
  * of sign hashes, one per token and in token order. Tokenization is performed using the
  * configured tokenizer.
  *
  * @param s the words in string format
  * @return a list of sign hashes, one for each token of {@code s}
  * @throws LexException if the lookup for a word yields no signs
  */
 public List<SignHash> getEntriesFromWords(String s) throws LexException {
   List<SignHash> entries = new ArrayList<SignHash>();
   for (Word word : tokenizer.tokenize(s)) {
     SignHash signs = getSignsFromWord(word);
     // a word without any signs is treated as missing from the lexicon
     if (signs.size() == 0) {
       throw new LexException("Word not in lexicon: \"" + word + "\"");
     }
     entries.add(signs);
   }
   return entries;
 }
コード例 #7
0
ファイル: Lexicon.java プロジェクト: kevinkissi/openccg
 /**
  * Looks up the coarticulation signs for the surface attribute-value pairs of {@code w} and
  * applies them, one pair after another, to the signs in {@code result}. On return
  * {@code result} holds the signs produced by the last round instead of its original contents.
  *
  * <p>Pairs whose attribute is not in {@code _indexedCoartAttrs} are ignored.
  *
  * @param w the word whose surface attributes select the coarticulations
  * @param result the signs to transform; cleared and refilled in place
  * @throws LexException if looking up the signs of a coarticulation word fails
  */
 @SuppressWarnings("unchecked")
 private void applyCoarts(Word w, SignHash result) throws LexException {
   List<Sign> current = new ArrayList<Sign>(result.asSignSet());
   result.clear();
   List<Sign> next = new ArrayList<Sign>(current.size());
   Iterator<Pair<String, String>> pairs = w.getSurfaceAttrValPairs();
   while (pairs.hasNext()) {
     Pair<String, String> pair = pairs.next();
     String attr = pair.a;
     if (!_indexedCoartAttrs.contains(attr)) {
       continue;
     }
     // the coart signs are those of a word made up of just this attr-val pair
     Word coartWord = Word.createWord(attr, pair.b);
     SignHash coartSigns = getSignsFromWord(coartWord, null, null, null);
     Iterator<Sign> coartIt = coartSigns.iterator();
     while (coartIt.hasNext()) {
       Sign coartSign = coartIt.next();
       // apply to each input; applyCoart receives next as its sink
       for (Sign sign : current) {
         grammar.rules.applyCoart(sign, coartSign, next);
       }
     }
     // the output of this round is the input of the following one
     current.clear();
     current.addAll(next);
     next.clear();
   }
   // hand the final round back to the caller
   result.addAll(current);
 }
コード例 #8
0
ファイル: Lexicon.java プロジェクト: kevinkissi/openccg
 /**
  * Returns whether the given object is a {@code PredLookup} with an equal pred and equal coart
  * rels, where two null coart rels count as equal.
  */
 @Override
 public boolean equals(Object obj) {
   if (!(obj instanceof PredLookup)) {
     return false;
   }
   PredLookup other = (PredLookup) obj;
   // compare preds first; coart rels may be null on either side
   return pred.equals(other.pred)
       && (coartRels == null ? other.coartRels == null : coartRels.equals(other.coartRels));
 }
コード例 #9
0
ファイル: Lexicon.java プロジェクト: kevinkissi/openccg
 /**
  * Handles one scanned element: an {@code entry} element is turned into a morph item and a
  * {@code macro} element into a macro item, each added to its list. An element that cannot be
  * converted is reported on standard error and skipped; other elements are ignored.
  */
 public void handleElement(Element e) {
   String tag = e.getName();
   if (tag.equals("entry")) {
     // create morph item
     try {
       morphItems.add(new MorphItem(e));
     } catch (RuntimeException exc) {
       System.err.println("Skipping morph item: " + e.getAttributeValue("word"));
       System.err.println(exc);
     }
     return;
   }
   if (tag.equals("macro")) {
     // create macro item
     try {
       macroItems.add(new MacroItem(e));
     } catch (RuntimeException exc) {
       System.err.println("Skipping macro item: " + e.getAttributeValue("name"));
       System.err.println(exc);
     }
   }
 }
コード例 #10
0
ファイル: Lexicon.java プロジェクト: kevinkissi/openccg
 /**
  * Handles one scanned element: a {@code family} element is turned into a family and added to
  * the lexicon list, while the {@code distributive-features}, {@code licensing-features} and
  * {@code relation-sorting} elements are saved for later use. A family that cannot be created is
  * reported on standard error and skipped; other elements are ignored.
  */
 public void handleElement(Element e) {
   String tag = e.getName();
   if (tag.equals("family")) {
     // create family
     try {
       lexicon.add(new Family(e));
     } catch (RuntimeException exc) {
       System.err.println("Skipping family: " + e.getAttributeValue("name"));
       System.err.println(exc);
     }
   } else if (tag.equals("distributive-features")) {
     // save distributive attributes
     distrElt = e;
   } else if (tag.equals("licensing-features")) {
     // save licensing features
     licensingElt = e;
   } else if (tag.equals("relation-sorting")) {
     // save relation sort order
     relationSortingElt = e;
   }
 }
コード例 #11
0
ファイル: Lexicon.java プロジェクト: kevinkissi/openccg
  /**
   * Loads the lexicon and morph files and rebuilds the lookup indexes from them.
   *
   * <p>The category families are read with {@code getLexicon} and the morph and macro items with
   * {@code getMorph}. The following fields are then replaced with freshly built structures:
   * {@code _words}, {@code _predToWords}, {@code _coartAttrs}, {@code _indexedCoartAttrs},
   * {@code _stems}, {@code _posToEntries}, {@code _stagToEntries}, {@code _relsToPreds},
   * {@code _coartRelsToPreds}, {@code _catsToAttrs}, {@code _lfAttrs}, {@code _macros} and
   * {@code _macroItems}. Finally the morph items are cross-checked against the indexes, and any
   * inconsistency is reported as a warning; no exception is raised for it here.
   *
   * @param lexiconUrl location of the lexicon file, passed to {@code getLexicon}
   * @param morphUrl location of the morph file, passed to {@code getMorph}
   * @throws IOException presumably when one of the two files cannot be read -- the loading
   *     helpers are not visible here; confirm against them
   */
  public void init(URL lexiconUrl, URL morphUrl) throws IOException {

    List<Family> lexicon = null;
    List<MorphItem> morph = null;
    List<MacroItem> macroModel = null;

    // load category families (lexicon), morph forms and macros
    lexicon = getLexicon(lexiconUrl);
    Pair<List<MorphItem>, List<MacroItem>> morphInfo = getMorph(morphUrl);
    morph = morphInfo.a;
    macroModel = morphInfo.b;

    // index words; also index stems to words, as default preds
    // store indexed coarticulation attrs too
    _words = new GroupMap<Word, MorphItem>();
    _predToWords = new GroupMap<String, Word>();
    _coartAttrs = new HashSet<String>();
    _indexedCoartAttrs = new HashSet<String>();
    for (MorphItem morphItem : morph) {
      Word surfaceWord = morphItem.getSurfaceWord();
      _words.put(surfaceWord, morphItem);
      _predToWords.put(morphItem.getWord().getStem(), surfaceWord);
      if (morphItem.isCoart()) {
        // coart items are additionally indexed under their coart indexing word
        Word indexingWord = morphItem.getCoartIndexingWord();
        _words.put(indexingWord, morphItem);
        // only the first attr-val pair of the indexing word is recorded as an indexed attr
        Pair<String, String> first = indexingWord.getSurfaceAttrValPairs().next();
        _indexedCoartAttrs.add(first.a);
        // every surface attr of the coart item's surface word counts as a coart attr
        for (Iterator<Pair<String, String>> it = surfaceWord.getSurfaceAttrValPairs();
            it.hasNext(); ) {
          Pair<String, String> p = it.next();
          _coartAttrs.add(p.a);
        }
      }
    }

    // index entries based on stem+pos
    _stems = new GroupMap<String, Object>();
    _posToEntries = new GroupMap<String, EntriesItem[]>();
    // index entries by supertag+pos, for supertagging
    _stagToEntries = new GroupMap<String, EntriesItem>();
    // also index rels and coart rels to preds
    _relsToPreds = new GroupMap<String, String>();
    _coartRelsToPreds = new GroupMap<String, String>();
    // and gather list of attributes used per atomic category type
    // NOTE(review): these two are not filled directly in this method; presumably gatherAttrs
    // (applied to each entry's category below) populates them -- confirm
    _catsToAttrs = new GroupMap<String, String>();
    _lfAttrs = new HashSet<String>();
    // and remember family and entry names, for checking excluded list on morph items
    HashSet<String> familyAndEntryNames = new HashSet<String>();

    // index each family
    for (Family family : lexicon) {

      familyAndEntryNames.add(family.getName());
      EntriesItem[] entries = family.getEntries();
      DataItem[] data = family.getData();

      // for generic use when we get an unknown stem
      // from the morphological analyzer
      if (!family.isClosed()) {
        _posToEntries.put(family.getPOS(), entries);
      }

      // scan through entries
      for (int j = 0; j < entries.length; j++) {
        // index
        EntriesItem eItem = entries[j];
        _stagToEntries.put(eItem.getSupertag() + family.getPOS(), eItem);
        // entries with an empty stem are not indexed by stem
        if (eItem.getStem().length() > 0) {
          _stems.put(eItem.getStem() + family.getPOS(), eItem);
        }
        try {
          // gather features
          eItem.getCat().forall(gatherAttrs);
          // record names
          familyAndEntryNames.add(eItem.getName());
          familyAndEntryNames.add(eItem.getQualifiedName());
        } catch (RuntimeException exc) {
          // report and carry on with the next entry
          System.err.println("exception for: " + family.getName() + ": " + exc);
        }
      }

      // scan through data
      for (int j = 0; j < data.length; j++) {
        DataItem dItem = data[j];
        // data items are stored under stem+pos together with all entries of the family
        _stems.put(
            dItem.getStem() + family.getPOS(), new Pair<DataItem, EntriesItem[]>(dItem, entries));
        // index non-default preds to words
        if (!dItem.getStem().equals(dItem.getPred())) {
          // the words already indexed under the stem are indexed under the pred as well
          Collection<Word> words = (Collection<Word>) _predToWords.get(dItem.getStem());
          if (words == null) {
            // the warning is suppressed when openlex is set
            if (!openlex) {
              System.out.print("Warning: couldn't find words for pred '");
              System.out.println(dItem.getPred() + "' with stem '" + dItem.getStem() + "'");
            }
          } else {
            for (Iterator<Word> it = words.iterator(); it.hasNext(); ) {
              _predToWords.put(dItem.getPred(), it.next());
            }
          }
        }
      }

      // index rels to preds
      // nb: this covers relational (eg @x<GenRel>e) and featural (eg @e<tense>past)
      //     elementary predications
      List<String> indexRels = new ArrayList<String>(3);
      String familyIndexRel = family.getIndexRel();
      if (familyIndexRel.length() > 0) {
        indexRels.add(familyIndexRel);
      }
      // add the non-empty index rels of the entries that differ from the family's
      for (int j = 0; j < entries.length; j++) {
        EntriesItem eItem = entries[j];
        String indexRel = eItem.getIndexRel();
        if (indexRel.length() > 0 && !indexRel.equals(familyIndexRel)) {
          indexRels.add(indexRel);
        }
      }
      // every gathered index rel is mapped to the pred of every data item of the family
      for (Iterator<String> it = indexRels.iterator(); it.hasNext(); ) {
        String indexRel = it.next();
        // nb: not indexing on entries items, b/c some stems are still defaults
        for (int j = 0; j < data.length; j++) {
          DataItem dItem = data[j];
          _relsToPreds.put(indexRel, dItem.getPred());
        }
      }

      // index coart rels (features, really) to preds
      String coartRel = family.getCoartRel();
      if (coartRel.length() > 0) {
        for (int j = 0; j < data.length; j++) {
          _coartRelsToPreds.put(coartRel, data[j].getPred());
        }
      }
    }

    // index the macros
    _macros = new GroupMap<String, FeatureStructure>();
    // nb: could just index MacroItem objects for feature structures too;
    //     this might be a bit cleaner, but life is short
    _macroItems = new HashMap<String, MacroItem>();
    for (MacroItem mi : macroModel) {
      String macName = mi.getName();
      FeatureStructure[] specs = mi.getFeatureStructures();
      for (int j = 0; j < specs.length; j++) {
        _macros.put(macName, specs[j]);
      }
      // this is for handling LF part of macros
      _macroItems.put(macName, mi);
    }

    // with morph items, check POS, macro names, excluded list for xref
    // (problems found here are only reported on standard error)
    for (MorphItem morphItem : morph) {
      Word w = morphItem.getWord();
      // warn, unless openlex is set, when there are neither entries for the word's stem+pos
      // nor generic entries for its pos
      if (!openlex
          && !_stems.containsKey(w.getStem() + w.getPOS())
          && !_posToEntries.containsKey(w.getPOS())) {
        System.err.println(
            "Warning: no entries for stem '"
                + w.getStem()
                + "' and POS '"
                + w.getPOS()
                + "' found for word '"
                + w
                + "'");
      }
      // every macro named by the morph item should have a macro item
      String[] macroNames = morphItem.getMacros();
      for (int j = 0; j < macroNames.length; j++) {
        if (!_macroItems.containsKey(macroNames[j])) {
          System.err.println(
              "Warning: macro "
                  + macroNames[j]
                  + " not found for word '"
                  + morphItem.getWord()
                  + "'");
        }
      }
      // every excluded name should be a known family, entry or qualified entry name
      String[] excludedNames = morphItem.getExcluded();
      for (int j = 0; j < excludedNames.length; j++) {
        if (!familyAndEntryNames.contains(excludedNames[j])) {
          System.err.println(
              "Warning: excluded family or entry '"
                  + excludedNames[j]
                  + "' not found for word '"
                  + morphItem.getWord()
                  + "'");
        }
      }
    }
  }
コード例 #12
0
ファイル: Lexicon.java プロジェクト: kevinkissi/openccg
 /**
  * Returns the hash code of the pred plus, when present, the hash code of the coart rels.
  */
 @Override
 public int hashCode() {
   int hash = pred.hashCode();
   if (coartRels != null) {
     hash += coartRels.hashCode();
   }
   return hash;
 }