Example #1
0
 /**
  * Modifies the name of the parent acid from ic to ate (or ous to ite) hence allowing the
  * formation of the uninverted ester
  *
  * @param parent
  * @return
  * @throws ParsingException
  */
 private static String uninvertEster(String parent) throws ParsingException {
   int len = parent.length();
   if (len == 0) {
     throw new ParsingException("Failed to uninvert CAS ester");
   }
   char lastChar = parent.charAt(len - 1);
   if (lastChar == ')') {
     if (StringTools.endsWithCaseInsensitive(parent, "ic acid)")) {
       parent = parent.substring(0, parent.length() - 8) + "ate)";
     } else if (StringTools.endsWithCaseInsensitive(parent, "ous acid)")) {
       parent = parent.substring(0, parent.length() - 9) + "ite)";
     } else if (StringTools.endsWithCaseInsensitive(parent, "ine)")) { // amino acid
       parent = parent.substring(0, parent.length() - 2) + "ate)";
     } else {
       throw new ParsingException("Failed to uninvert CAS ester");
     }
   } else {
     if (StringTools.endsWithCaseInsensitive(parent, "ic acid")) {
       parent = parent.substring(0, parent.length() - 7) + "ate";
     } else if (StringTools.endsWithCaseInsensitive(parent, "ous acid")) {
       parent = parent.substring(0, parent.length() - 8) + "ite";
     } else if (StringTools.endsWithCaseInsensitive(parent, "ine")) { // amino acid
       parent = parent.substring(0, parent.length() - 1) + "ate";
     } else {
       throw new ParsingException("Failed to uninvert CAS ester");
     }
   }
   return parent;
 }
Example #2
0
  /**
   * Inverts a CAS name. Throws an exception is OPSIN is unable to determine whether something is a
   * substituent or functional term or if something unexpected in a CAS name is encountered
   *
   * @param name
   * @return
   * @throws ParsingException
   */
  static String uninvertCASName(String name, ParseRules parseRules) throws ParsingException {
    List<String> nameComponents = new ArrayList<String>(Arrays.asList(matchCommaSpace.split(name)));
    List<String> substituents = new ArrayList<String>();
    List<String> seperateWordSubstituents = new ArrayList<String>();
    List<String> functionalTerms = new ArrayList<String>();

    String parent = nameComponents.get(0);
    String[] parentNameParts = MATCH_SPACE.split(parent);
    if (parentNameParts.length != 1) {
      if (matchCasCollectiveIndex
          .matcher(parentNameParts[parentNameParts.length - 1])
          .matches()) { // CAS collective index description should be ignored
        StringBuilder parentSB = new StringBuilder();
        for (int i = 0; i < parentNameParts.length - 1; i++) {
          parentSB.append(parentNameParts[i]);
        }
        parent = parentSB.toString();
        parentNameParts = MATCH_SPACE.split(parent);
      }
      for (int i = 1; i < parentNameParts.length; i++) {
        if (!matchAcid.matcher(parentNameParts[i]).matches()) {
          ParseRulesResults results = parseRules.getParses(parentNameParts[i]);
          List<ParseTokens> parseTokens = results.getParseTokensList();
          if (parseTokens.isEmpty()) {
            throw new ParsingException(
                "Invalid CAS name. Parent compound was followed by an unexpected term");
          }
        }
      }
    }
    boolean addedBracket = false;
    boolean esterEncountered = false;
    for (int i = 1; i < nameComponents.size(); i++) {
      String nameComponent = nameComponents.get(i);
      Matcher m = matchCompoundWithPhrase.matcher(nameComponent);
      boolean compoundWithcomponent = false;
      if (m.lookingAt()) {
        nameComponent = nameComponent.substring(m.group().length());
        compoundWithcomponent = true;
      }
      String[] components = MATCH_SPACE.split(nameComponents.get(i));
      for (String component : components) {
        if (compoundWithcomponent) {
          functionalTerms.add(component);
          continue;
        }
        if (component.endsWith("-")) {
          Character missingCloseBracket = missingCloseBracketCharIfApplicable(component);
          if (missingCloseBracket != null) {
            if (addedBracket) {
              throw new ParsingException("Close bracket appears to be missing");
            }
            parent += missingCloseBracket;
            addedBracket = true;
          }
          substituents.add(component);
        } else {
          ParseRulesResults results = parseRules.getParses(component);
          List<ParseTokens> parseTokens = results.getParseTokensList();
          if (parseTokens.size() > 0) {
            List<ParseWord> parseWords = WordTools.splitIntoParseWords(parseTokens, component);

            List<ParseTokens> firstParseWordTokens = parseWords.get(0).getParseTokens();
            WordType firstWordType =
                OpsinTools.determineWordType(firstParseWordTokens.get(0).getAnnotations());
            for (int j = 1; j < firstParseWordTokens.size(); j++) {
              if (!firstWordType.equals(
                  OpsinTools.determineWordType(firstParseWordTokens.get(j).getAnnotations()))) {
                throw new ParsingException(
                    component
                        + "can be interpreted in multiple ways. For the sake of precision OPSIN has decided not to process this as a CAS name");
              }
            }

            if (parseWords.size() == 1) {
              switch (firstWordType) {
                case functionalTerm:
                  if (component.equalsIgnoreCase("ester")) {
                    if (seperateWordSubstituents.size() == 0) {
                      throw new ParsingException(
                          "ester encountered but no substituents were specified in potential CAS name!");
                    }
                    if (esterEncountered) {
                      throw new ParsingException(
                          "ester formation was mentioned more than once in CAS name!");
                    }
                    parent = uninvertEster(parent);
                    esterEncountered = true;
                  } else {
                    functionalTerms.add(component);
                  }
                  break;
                case substituent:
                  seperateWordSubstituents.add(component);
                  break;
                case full:
                  if (StringTools.endsWithCaseInsensitive(component, "ate")
                      || StringTools.endsWithCaseInsensitive(
                          component,
                          "ite") // e.g. Piperazinium, 1,1-dimethyl-, 2,2,2-trifluoroacetate
                      // hydrochloride
                      || StringTools.endsWithCaseInsensitive(component, "ium")
                      || StringTools.endsWithCaseInsensitive(component, "hydrofluoride")
                      || StringTools.endsWithCaseInsensitive(component, "hydrochloride")
                      || StringTools.endsWithCaseInsensitive(component, "hydrobromide")
                      || StringTools.endsWithCaseInsensitive(component, "hydroiodide")) {
                    functionalTerms.add(component);
                  } else {
                    throw new ParsingException(
                        "Unable to interpret: "
                            + component
                            + " (as part of a CAS index name)- A full word was encountered where a substituent or functionalTerm was expected");
                  }
                  break;
                default:
                  throw new ParsingException("Unrecognised CAS index name form");
              }
            } else if (parseWords.size() == 2 && firstWordType.equals(WordType.substituent)) {
              // could be something like O-methyloxime which is parsed as [O-methyl] [oxime]
              List<ParseTokens> secondParseWordTokens = parseWords.get(1).getParseTokens();
              WordType secondWordType =
                  OpsinTools.determineWordType(secondParseWordTokens.get(0).getAnnotations());
              for (int j = 1; j < secondParseWordTokens.size(); j++) {
                if (!secondWordType.equals(
                    OpsinTools.determineWordType(secondParseWordTokens.get(j).getAnnotations()))) {
                  throw new ParsingException(
                      component
                          + "can be interpreted in multiple ways. For the sake of precision OPSIN has decided not to process this as a CAS name");
                }
              }
              if (secondWordType.equals(WordType.functionalTerm)
                  && matchFunctionalTermAllowingSubstituentPrefix
                      .matcher(parseWords.get(1).getWord())
                      .matches()) {
                functionalTerms.add(component);
              } else {
                throw new ParsingException(
                    "Unrecognised CAS index name form, could have a missing space?");
              }
            } else {
              throw new ParsingException("Unrecognised CAS index name form");
            }
          } else {
            if (!matchCasCollectiveIndex
                .matcher(component)
                .matches()) { // CAS collective index description should be ignored
              throw new ParsingException(
                  "Unable to interpret: " + component + " (as part of a CAS index name)");
            }
          }
        }
      }
    }
    StringBuilder casName = new StringBuilder();
    for (String prefixFunctionalTerm : seperateWordSubstituents) {
      casName.append(prefixFunctionalTerm);
      casName.append(" ");
    }
    for (int i = substituents.size() - 1; i >= 0; i--) {
      // stereochemistry term comes after substituent term. In older CAS names (9CI) this
      // stereochemistry term can apply to the substituent term. Hence append in reverse order
      casName.append(substituents.get(i));
    }
    casName.append(parent);
    for (String functionalTerm : functionalTerms) {
      casName.append(" ");
      casName.append(functionalTerm);
    }
    return casName.toString();
  }