/**
 * Reads {@code file} line by line, looks for tokens shaped like a parenthesised abbreviation, and
 * tries to register an expansion for each new abbreviation in {@code textAbbreviationExpansionMap}.
 *
 * <p>For every matching token the abbreviation is lower-cased, stripped of parentheses and of one
 * trailing {@code ,}, {@code .} or {@code ;}. If neither the abbreviation nor its reversed form
 * (as produced by {@code Ling.reverse}) is already a key in the map, the private overload is asked
 * to derive an expansion for the abbreviation; if that adds nothing, it is retried with the
 * reversed form.
 *
 * <p>The file is decoded with the platform default charset, as {@link FileReader} does.
 *
 * @param file text file to scan
 * @throws IOException if the file cannot be opened or read
 */
public void setTextAbbreviationExpansionMap(File file) throws IOException {
  // try-with-resources guarantees the reader is closed even when parsing throws.
  try (BufferedReader in = new BufferedReader(new FileReader(file))) {
    String line;
    // readLine() returning null is the reliable end-of-stream signal; ready() is not.
    while ((line = in.readLine()) != null) {
      String[] tokens = line.trim().replaceAll("\\s+", " ").split("\\s");
      for (int i = 0; i < tokens.length; i++) {
        String token = tokens[i];

        int expansionIndex = -1;
        if (token.matches("\\(\\w+(\\-\\w+)?\\)(,|\\.)?")
            || token.matches("\\([A-Z]+(;|,|\\.)")) {
          // Whole abbreviation sits in this token: the expansion ends at the previous token.
          // For i == 0 this yields -1, which is skipped below.
          expansionIndex = i - 1;
        } else if (token.matches("[A-Z]+\\)")) {
          // Only the closing half is in this token; Util.firstIndexOf is asked to locate the
          // token matching "\\(" (NOTE(review): search direction/semantics defined in Util).
          expansionIndex = Util.firstIndexOf(tokens, i, "\\(");
        }
        if (expansionIndex == -1) {
          continue;
        }

        String abbreviation = token.replace("(", "").replace(")", "").toLowerCase();
        // The patterns above guarantee at least one character remains at this point.
        char last = abbreviation.charAt(abbreviation.length() - 1);
        if (last == ',' || last == '.' || last == ';') {
          abbreviation = abbreviation.substring(0, abbreviation.length() - 1);
        }
        // Reverse only after trailing punctuation is removed, so the reversed key never
        // starts with a stray ',', '.' or ';'.
        String reversedAbbreviation = Ling.reverse(abbreviation);

        if (textAbbreviationExpansionMap.containsKey(abbreviation)
            || textAbbreviationExpansionMap.containsKey(reversedAbbreviation)) {
          continue;
        }

        int abbreviationLength = abbreviation.length();
        setTextAbbreviationExpansionMap(tokens, abbreviationLength, abbreviation, expansionIndex);
        if (!textAbbreviationExpansionMap.containsKey(abbreviation)) {
          // Nothing was found for the abbreviation as written; retry with the reversed form.
          setTextAbbreviationExpansionMap(
              tokens, abbreviationLength, reversedAbbreviation, expansionIndex);
        }
      }
    }
  }
}
/**
 * Picks one expansion from {@code expansionList} based on how many of its content words (as
 * returned by {@code Ling.getContentWordsList}) occur in {@code text}.
 *
 * <p>A content word counts as occurring when {@code text} contains it directly preceded or
 * directly followed by a single space. Candidates are visited in list order:
 *
 * <ul>
 *   <li>a candidate whose content words all occur, and which has strictly more content words than
 *       the current best, becomes the best and pins the match score to 1000;
 *   <li>otherwise a candidate with at least as many content words as the current best and a
 *       strictly higher match count than the current score becomes the best.
 * </ul>
 *
 * @param text text in which content words are looked up
 * @param expansionList candidate expansions
 * @return the selected expansion, or the empty string if no candidate qualified
 */
public static String getBestExpansion(String text, List<String> expansionList) {
  int bestWordCount = 0;
  int bestMatchCount = 0;
  String best = "";

  for (String candidate : expansionList) {
    List<String> contentWords = Ling.getContentWordsList(candidate.split("\\s"));
    int wordCount = contentWords.size();

    int matchCount = 0;
    for (String word : contentWords) {
      boolean precededBySpace = text.contains(" " + word);
      if (precededBySpace || text.contains(word + " ")) {
        matchCount++;
      }
    }

    boolean fullyMatched = matchCount == wordCount;
    if (fullyMatched && wordCount > bestWordCount) {
      // Sentinel score: later partial matches cannot realistically exceed it.
      bestMatchCount = 1000;
    } else if (wordCount >= bestWordCount && matchCount > bestMatchCount) {
      bestMatchCount = matchCount;
    } else {
      // Candidate does not beat the current best; leave state untouched.
      continue;
    }
    bestWordCount = wordCount;
    best = candidate;
  }

  return best;
}
/**
 * Derives an expansion for {@code abbreviation} from {@code tokens} and, if the result is
 * non-empty, stores it in {@code textAbbreviationExpansionMap} under {@code abbreviation}.
 *
 * <p>Pipeline: {@code getTentativeExpansion} produces a candidate, which is passed with the
 * abbreviation to {@code getExpansionByHearstAlgorithm}; the result is lower-cased, run through
 * {@code Ling.correctSpelling}, and trimmed.
 *
 * @param tokens tokens of the current line
 * @param abbreviationLength length of the abbreviation, forwarded to {@code getTentativeExpansion}
 * @param abbreviation map key to register the expansion under
 * @param expansionIndex token index forwarded to {@code getTentativeExpansion}
 */
private void setTextAbbreviationExpansionMap(
    String[] tokens, int abbreviationLength, String abbreviation, int expansionIndex) {
  String tentative = getTentativeExpansion(tokens, expansionIndex, abbreviationLength);
  String refined = getExpansionByHearstAlgorithm(abbreviation, tentative);
  String corrected = Ling.correctSpelling(refined.toLowerCase());
  String expansion = corrected.trim();

  // An empty result means no usable expansion was found; leave the map unchanged.
  if (expansion.isEmpty()) {
    return;
  }
  textAbbreviationExpansionMap.put(abbreviation, expansion);
}