コード例 #1
0
ファイル: Run.java プロジェクト: biosemantics/charaparser
  private static void initGlossary(IGlossary glossary, IInflector inflector, TaxonGroup taxonGroup)
      throws IOException {
    OTOClient otoClient = new OTOClient("http://biosemantics.arizona.edu:8080/OTO");
    GlossaryDownload glossaryDownload = new GlossaryDownload();
    String glossaryVersion = "latest";
    otoClient.open();
    Future<GlossaryDownload> futureGlossaryDownload =
        otoClient.getGlossaryDownload(taxonGroup.getDisplayName(), glossaryVersion);

    try {
      glossaryDownload = futureGlossaryDownload.get();
    } catch (Exception e) {
      otoClient.close();
      e.printStackTrace();
    }
    otoClient.close();

    // add the syn set of the glossary
    HashSet<Term> gsyns = new HashSet<Term>();
    for (TermSynonym termSyn : glossaryDownload.getTermSynonyms()) {

      // if(termSyn.getCategory().compareTo("structure")==0){
      if (termSyn.getCategory().matches("structure|taxon_name|substance")) {
        // take care of singular and plural forms
        String syns = "";
        String synp = "";
        String terms = "";
        String termp = "";
        if (inflector.isPlural(
            termSyn
                .getSynonym()
                .replaceAll(
                    "_",
                    "-"))) { // must convert _ to -, as matching entity phrases will be converted
                             // from leg iii to leg-iii in the sentence.
          synp = termSyn.getSynonym().replaceAll("_", "-");
          syns = inflector.getSingular(synp);
        } else {
          syns = termSyn.getSynonym().replaceAll("_", "-");
          synp = inflector.getPlural(syns);
        }

        if (inflector.isPlural(termSyn.getTerm().replaceAll("_", "-"))) {
          termp = termSyn.getTerm().replaceAll("_", "-");
          terms = inflector.getSingular(termp);
        } else {
          terms = termSyn.getTerm().replaceAll("_", "-");
          termp = inflector.getPlural(terms);
        }
        glossary.addSynonym(syns, termSyn.getCategory(), terms);
        glossary.addSynonym(synp, termSyn.getCategory(), termp);
        gsyns.add(new Term(syns, termSyn.getCategory()));
        gsyns.add(new Term(synp, termSyn.getCategory()));
      } else {
        // glossary.addSynonym(termSyn.getSynonym().replaceAll("_",  "-"), "arrangement",
        // termSyn.getTerm());
        glossary.addSynonym(
            termSyn.getSynonym().replaceAll("_", "-"), termSyn.getCategory(), termSyn.getTerm());
        gsyns.add(new Term(termSyn.getSynonym().replaceAll("_", "-"), termSyn.getCategory()));
        // gsyns.add(new Term(termSyn.getSynonym().replaceAll("_",  "-"), "arrangement"));
      }
    }

    // the glossary, excluding gsyns
    for (TermCategory termCategory : glossaryDownload.getTermCategories()) {
      if (!gsyns.contains(
          new Term(termCategory.getTerm().replaceAll("_", "-"), termCategory.getCategory())))
        glossary.addEntry(
            termCategory.getTerm().replaceAll("_", "-"),
            termCategory.getCategory()); // primocane_foliage =>primocane-foliage Hong 3/2014
    }

    List<Synonym> synonyms = new LinkedList<Synonym>();
    CSVReader reader =
        new CSVReader(
            new FileReader(
                "C:\\Users\\rodenhausen\\Desktop\\test-enhance\\"
                    + "Gordon_complexity_term_review\\category_mainterm_synonymterm-task-Gordon_complexity.csv"));
    List<String[]> lines = reader.readAll();
    int i = 0;
    Set<String> hasSynonym = new HashSet<String>();
    for (String[] line : lines) {
      synonyms.add(new Synonym(String.valueOf(i), line[1], line[0], line[2]));
      hasSynonym.add(line[1]);
    }

    reader =
        new CSVReader(
            new FileReader(
                "C:\\Users\\rodenhausen\\Desktop\\test-enhance\\"
                    + "Gordon_complexity_term_review\\category_term-task-Gordon_complexity.csv"));
    lines = reader.readAll();
    List<Decision> decisions = new LinkedList<Decision>();
    i = 0;
    for (String[] line : lines) {
      decisions.add(
          new Decision(String.valueOf(i), line[1], line[0], hasSynonym.contains(line[1]), ""));
    }

    Download download = new Download(true, decisions, synonyms);

    // add syn set of term_category
    HashSet<Term> dsyns = new HashSet<Term>();
    if (download != null) {
      for (Synonym termSyn : download.getSynonyms()) {
        // Hong TODO need to add category info to synonym entry in OTOLite
        // if(termSyn.getCategory().compareTo("structure")==0){
        if (termSyn.getCategory().matches("structure|taxon_name|substance")) {
          // take care of singular and plural forms
          String syns = "";
          String synp = "";
          String terms = "";
          String termp = "";
          if (inflector.isPlural(termSyn.getSynonym().replaceAll("_", "-"))) {
            synp = termSyn.getSynonym().replaceAll("_", "-");
            syns = inflector.getSingular(synp);
          } else {
            syns = termSyn.getSynonym().replaceAll("_", "-");
            synp = inflector.getPlural(syns);
          }

          if (inflector.isPlural(termSyn.getTerm().replaceAll("_", "-"))) {
            termp = termSyn.getTerm().replaceAll("_", "-");
            terms = inflector.getSingular(termp);
          } else {
            terms = termSyn.getTerm().replaceAll("_", "-");
            termp = inflector.getPlural(terms);
          }
          // glossary.addSynonym(syns, termSyn.getCategory(), terms);
          // glossary.addSynonym(synp, termSyn.getCategory(), termp);
          // dsyns.add(new Term(syns, termSyn.getCategory());
          // dsyns.add(new Term(synp, termSyn.getCategory());
          glossary.addSynonym(syns, termSyn.getCategory(), terms);
          glossary.addSynonym(synp, termSyn.getCategory(), termp);
          dsyns.add(new Term(syns, termSyn.getCategory()));
          dsyns.add(new Term(synp, termSyn.getCategory()));
        } else { // forking_1 and forking are syns 5/5/14 hong test, shouldn't _1 have already been
                 // removed?
          glossary.addSynonym(
              termSyn.getSynonym().replaceAll("_", "-"), termSyn.getCategory(), termSyn.getTerm());
          dsyns.add(new Term(termSyn.getSynonym().replaceAll("_", "-"), termSyn.getCategory()));
        }
      }

      // term_category from OTO, excluding dsyns
      for (Decision decision : download.getDecisions()) {
        if (!dsyns.contains(
            new Term(
                decision.getTerm().replaceAll("_", "-"),
                decision.getCategory()))) // calyx_tube => calyx-tube
        glossary.addEntry(decision.getTerm().replaceAll("_", "-"), decision.getCategory());
      }
    }
  }
コード例 #2
0
ファイル: Run.java プロジェクト: biosemantics/charaparser
  public static void main(String[] args) throws IOException {
    /*for (String arg : args) {
      // option #1: By sentence.
      DocumentPreprocessor dp = new DocumentPreprocessor(arg);
      for (List<HasWord> sentence : dp) {
        System.out.println(sentence);
      }
      // option #2: By token
      PTBTokenizer<CoreLabel> ptbt = new PTBTokenizer<>(new FileReader(arg),
              new CoreLabelTokenFactory(), "");
      while (ptbt.hasNext()) {
        CoreLabel label = ptbt.next();
        System.out.println(label);
      }
    }*/

    ITokenizer tokenizer = new WhitespaceTokenizer(); /*ITokenizer() {		
			@Override
			public List<Token> tokenize(String text) {
				List<Token> result = new LinkedList<Token>();
				
				return null;
			}
		};*/
    TaxonGroup taxonGroup = TaxonGroup.PLANT;
    WordNetPOSKnowledgeBase wordNetPOSKnowledgeBase =
        new WordNetPOSKnowledgeBase(Configuration.wordNetDirectory, false);
    SingularPluralProvider singularPluralProvider = new SingularPluralProvider();
    IInflector inflector =
        new SomeInflector(
            wordNetPOSKnowledgeBase,
            singularPluralProvider.getSingulars(),
            singularPluralProvider.getPlurals());
    Map<String, String> renames = new HashMap<String, String>();
    renames.put("count", "quantity");
    renames.put("atypical_count", "atypical_quantity");
    renames.put("color", "coloration");
    IGlossary glossary = new InMemoryGlossary();
    initGlossary(glossary, inflector, taxonGroup);
    Set<String> lifeStyles = glossary.getWordsInCategory("life_style");
    lifeStyles.addAll(glossary.getWordsInCategory("growth_form"));
    Set<String> durations = glossary.getWordsInCategory("duration");
    String negWords = "no|not|never";
    String advModifiers = "at least|at first|at times";
    String stopWords =
        "a|about|above|across|after|along|also|although|amp|an|and|are|as|at|be|because|become|becomes|becoming|been|before|being|"
            + "beneath|between|beyond|but|by|ca|can|could|did|do|does|doing|done|for|from|had|has|have|hence|here|how|however|if|in|into|inside|inward|is|it|its|"
            + "may|might|more|most|near|of|off|on|onto|or|out|outside|outward|over|should|so|than|that|the|then|there|these|this|those|throughout|"
            + "to|toward|towards|up|upward|was|were|what|when|where|which|why|with|within|without|would";
    String units =
        "(?:(?:pm|cm|mm|dm|ft|m|meters|meter|micro_m|micro-m|microns|micron|unes|µm|μm|um|centimeters|centimeter|millimeters|millimeter|transdiameters|transdiameter)[23]?)"; // squared or cubed
    ICharacterKnowledgeBase characterKnowledgeBase =
        new GlossaryBasedCharacterKnowledgeBase(
            glossary, negWords, advModifiers, stopWords, units, inflector);
    Set<String> possessionTerms = getWordSet("with|has|have|having|possess|possessing|consist_of");

    CSVReader reader =
        new CSVReader(
            new FileReader(
                "C:\\Users\\rodenhausen\\Desktop\\test-enhance\\"
                    + "Gordon_complexity_term_review\\category_mainterm_synonymterm-task-Gordon_complexity.csv"));
    List<String[]> lines = reader.readAll();
    int i = 0;
    final Map<String, SynonymSet> synonymSetsMap = new HashMap<String, SynonymSet>();
    for (String[] line : lines) {
      String preferredTerm = line[1];
      String synonym = line[2];
      if (!synonymSetsMap.containsKey(preferredTerm))
        synonymSetsMap.put(preferredTerm, new SynonymSet(preferredTerm, new HashSet<String>()));
      synonymSetsMap.get(preferredTerm).getSynonyms().add(synonym);
    }

    /*List<KnowsSynonyms> hasBiologicalEntitySynonymsList = new LinkedList<KnowsSynonyms>();
    List<KnowsSynonyms> hasCharacterSynonymsList = new LinkedList<KnowsSynonyms>();
    hasBiologicalEntitySynonymsList.add(new CSVKnowsSynonyms() {
    	@Override
    	public Set<SynonymSet> getSynonyms(String term) {
    		Set<SynonymSet> result = new HashSet<SynonymSet>();
    		for(SynonymSet synonymSet : synonymSetsMap.values()) {
    			if(synonymSet.getPreferredTerm().equals(term) || synonymSet.getSynonyms().contains(term))
    				result.add(synonymSet);
    		}
    		if(result.isEmpty())
    			result.add(new SynonymSet(term, new HashSet<String>()));
    		return result;
    	}
    });*/

    Run run = new Run();
    // AbstractTransformer transformer = new RemoveSynonyms(hasBiologicalEntitySynonymsList,
    // hasBiologicalEntitySynonymsList);
    // AbstractTransformer transformer = new CreateRelationFromCharacterConstraint(new
    // KeyWordBasedKnowsCharacterConstraintType(wordNetPOSKnowledgeBase), inflector);
    // AbstractTransformer transformer = new MoveRelationToBiologicalEntityConstraint();//new
    // KeyWordBasedKnowsCharacterConstraintType(wordNetPOSKnowledgeBase), inflector);
    // AbstractTransformer transformer = new
    // MoveNegationOrAdverbBiologicalEntityConstraint(wordNetPOSKnowledgeBase);
    /*AbstractTransformer transformer = new RemoveNonSpecificBiologicalEntitiesByRelations(new KnowsPartOf() {
    	@Override
    	public boolean isPartOf(String part, String parent) {
    		if(part.equals("apex") && parent.equals("leaf")) {
    			return true;
    		}
    		if(part.equals("base") && parent.equals("fruit")) {
    			return true;
    		}
    		if(part.equals("base") && parent.equals("petal")) {
    			return true;
    		}
    		return false;
    	}
    }, tokenizer, new CollapseBiologicalEntityToName());*/

    // AbstractTransformer transformer1 = new MoveCharacterToStructureConstraint();
    // AbstractTransformer transformer2 = new ReplaceNegationCharacterByNegationOrAbsence();

    /*AbstractTransformer transformer = new MoveModifierCharactersToBiologicalEntityConstraint(tokenizer, new KnowsEntityExistence() {
    	@Override
    	public boolean isExistsEntity(String name) {
    		if(name.equals("red leaf")) {
    			return true;
    		}
    		return false;
    	}
    });*/

    CSVKnowsSynonyms csvKnowsSynonyms = new CSVKnowsSynonyms("synonyms.csv", inflector);
    RemoveNonSpecificBiologicalEntitiesByRelations transformer1 =
        new RemoveNonSpecificBiologicalEntitiesByRelations(
            new CSVKnowsPartOf("part-of.csv", csvKnowsSynonyms, inflector),
            csvKnowsSynonyms,
            tokenizer,
            new CollapseBiologicalEntityToName());
    RemoveNonSpecificBiologicalEntitiesByBackwardConnectors transformer2 =
        new RemoveNonSpecificBiologicalEntitiesByBackwardConnectors(
            new CSVKnowsPartOf("part-of.csv", csvKnowsSynonyms, inflector),
            csvKnowsSynonyms,
            tokenizer,
            new CollapseBiologicalEntityToName());
    RemoveNonSpecificBiologicalEntitiesByForwardConnectors transformer3 =
        new RemoveNonSpecificBiologicalEntitiesByForwardConnectors(
            new CSVKnowsPartOf("part-of.csv", csvKnowsSynonyms, inflector),
            csvKnowsSynonyms,
            tokenizer,
            new CollapseBiologicalEntityToName());
    RemoveNonSpecificBiologicalEntitiesByPassedParents transformer4 =
        new RemoveNonSpecificBiologicalEntitiesByPassedParents(
            new CSVKnowsPartOf("part-of.csv", csvKnowsSynonyms, inflector),
            csvKnowsSynonyms,
            tokenizer,
            new CollapseBiologicalEntityToName(),
            inflector);
    // RemoveNonSpecificBiologicalEntitiesByCollections removeByCollections = new
    // RemoveNonSpecificBiologicalEntitiesByCollections(
    //		new CSVKnowsPartOf(csvKnowsSynonyms, inflector), csvKnowsSynonyms, new
    // CSVKnowsClassHierarchy(inflector),
    //		tokenizer, new CollapseBiologicalEntityToName(), inflector);

    run.addTransformer(new SimpleRemoveSynonyms(csvKnowsSynonyms));
    run.addTransformer(transformer1);
    run.addTransformer(transformer2);
    run.addTransformer(transformer3);
    run.addTransformer(transformer4);
    // run.addTransformer(removeByCollections);

    // run.addTransformer(transformer1);
    // run.addTransformer(transformer2);
    /*
    AbstractTransformer transformer = new CollapseCharacterToValue();
    run.addTransformer(new RemoveOrphanRelations());
    run.addTransformer(new RemoveDuplicateValues());
    run.addTransformer(new CollapseBiologicalEntityToName());
    run.addTransformer(new CollapseCharacterToValue());
    run.addTransformer(new CollapseBiologicalEntities());
    run.addTransformer(new CollapseCharacters());
    */
    /*
    run.addTransformer(new SplitCompoundBiologicalEntity(inflector));
    run.addTransformer(new SplitCompoundBiologicalEntitiesCharacters(inflector));
    run.addTransformer(new RemoveUselessWholeOrganism());
    run.addTransformer(new RemoveUselessCharacterConstraint());
    run.addTransformer(new RenameCharacter(renames));
    run.addTransformer(new MoveCharacterToStructureConstraint());
    run.addTransformer(new MoveNegationCharacterToBiologicalEntityConstraint());
    run.addTransformer(new MoveNegationOrAdverbBiologicalEntityConstraint(wordNetPOSKnowledgeBase));
    run.addTransformer(new MoveCharactersToAlternativeParent());
    run.addTransformer(new ReplaceTaxonNameByWholeOrganism());
    run.addTransformer(new CreateOrPopulateWholeOrganism(lifeStyles, "growth_form"));
    run.addTransformer(new CreateOrPopulateWholeOrganism(durations, "duration"));
    run.addTransformer(new StandardizeQuantityPresence());
    run.addTransformer(new StandardizeCount());
    run.addTransformer(new SortBiologicalEntityNameWithDistanceCharacter());
    run.addTransformer(new OrderBiologicalEntityConstraint());
    run.addTransformer(new StandardizeStructureName(characterKnowledgeBase, possessionTerms));
    run.addTransformer(new StandardizeTerminology(characterKnowledgeBase));

    run.addTransformer(new RemoveOrphanRelations());
    run.addTransformer(new RemoveDuplicateValues());
    run.addTransformer(new CollapseBiologicalEntityToName());
    run.addTransformer(new CollapseCharacterToValue());
    run.addTransformer(new CollapseBiologicalEntities());
    run.addTransformer(new CollapseCharacters());


    */

    // run.run(new File("C:\\Users\\rodenhausen\\Desktop\\test-enhance\\selection_parsed2"), new
    // File("C:\\Users\\rodenhausen\\Desktop\\test-enhance\\selection_parsed2_out_" +
    // transformer.getClass().getSimpleName()));
    run.run(new File("in"), new File("out"));
  }