public static String canonicalizeString(String text) { HashSet<String> outs = new HashSet<String>(); if (text.isEmpty()) return ""; String original = text; text = Common.removeDiacritic(text); text = text.replaceAll("(\\(\\s*)?علیها.?لسلام(\\s*\\))?", "( ع )"); text = text.replaceAll("(\\(\\s*)?علیها.?سلام(\\s*\\))?", "( ع )"); text = text.replaceAll("(\\(\\s*)?علیها.?السلام(\\s*\\))?", "( ع )"); text = text.replaceAll("(\\(\\s*)?علیه.?السلام(\\s*\\))?", "( ع )"); text = text.replaceAll("(\\(\\s*)?سلام.?الله.?علیه(\\(\\s*)?(\\s*\\))?", "( س )"); // text = text.replaceAll("(\\(\\s*)?صلی.?الله.?علیه.?و.?آله.?و.?سلم(\\s*\\))?", "( ص )"); text = text.replaceAll("(\\(\\s*)?صلی.?الله.?علیه.?و.?آله(\\s*\\))?", "( ص )"); text = text.replaceAll("(\\(\\s*)?صلی.?الله(\\s*\\))?", "( ص )"); text = text.replace("()", ""); text = text.replace("( )", ""); // tokenized version // converting حضرت محمد (ص) and حضرت محمد to canonical form محمد (ص) text = text.replaceAll("حضرت ([^ ]+) \\(\\s*ص\\s*\\)", "حضرت $1"); text = text.replaceAll("حضرت ([^ ]+) \\(\\s*ع\\s*\\)", "حضرت $1"); text = text.replaceAll("حضرت ([^ ]+) \\(\\s*س\\s*\\)", "حضرت $1"); text = text.replace(" ", " "); // either there is a stupid bug in regex or I am going bananas! the following regex does not // match the last parenthesis which complicates the code Pattern pattern = Pattern.compile("(?<!آن )\\s*(حضرت [^ ]+)"); Matcher matcher = pattern.matcher(text); String modified = text; while (matcher.find()) { String search = matcher.group(1); String name = search.substring("حضرت".length() + 1); if (name.equals("محمد")) modified = modified.replace(search, name + " ( ص )"); else modified = modified.replace(search, name + " ( ع )"); } text = modified; text = Common.removeParenthesisWithException(text, "ص", "ع", "س"); text = text.replace('\u200C', ' ').replace(" ", " ").trim(); return text; }
public static ArrayList<String> getLexicalTransformations(String original) { HashSet<String> outs = new HashSet<String>(); if (original.isEmpty()) return new ArrayList<String>(outs); HashSet<String> news = new HashSet<String>(); String canonical = Common.canonicalizeString(original); outs.add(canonical); // now other things String parenthesisLess = Common.removeParenthesis(canonical); String halfSpaceLess = canonical.replace('\u200C', ' '); String spaceLess = canonical.replace(' ', '\u200C'); String puncLess = Common.removePunctuations(canonical); outs.addAll(news); news.clear(); for (String probe : outs) { probe = probe.replaceAll("\\bعلیهالسلام\\b", ""); probe = probe.replaceAll("\\bعلیها السلام\\b", ""); probe = probe.replaceAll("\\bعلیه السلام\\b", ""); probe = probe.replaceAll("\\bعلیهالسلام\\b", ""); probe = probe.replaceAll("\\(\\s*ع\\s*\\)", ""); probe = probe.replaceAll("\\(\\s*س\\s*\\)", ""); probe = probe.replaceAll("\\(\\s*ص\\s*\\)", ""); probe = probe.replaceAll("\\bصلی الله علیه وآله\\b", ""); probe = probe.replaceAll("\\bصلی الله علیه و آله\\b", ""); probe = probe.replaceAll("\\bصلی الله علیه و آله و سلم\\b", ""); probe = probe.replaceAll("\\bصلی الله علیه وآله و سلم\\b", ""); probe = probe.replaceAll("\\bصلی الله علیه وآله وسلم\\b", ""); probe = probe.replace("()", ""); probe = probe.replace("( )", ""); // tokenized version news.addAll(probeLexicalTransformation(probe, outs)); } outs.addAll(news); news.clear(); outs.remove(original); return new ArrayList<String>(outs); }
/**
 * Writes {@code payload} to the file at {@code path}.
 *
 * <p>The writer comes from {@code Common.openFileForWriting} and is always closed, including
 * when the write itself fails. Any {@link IOException} raised while writing or closing is
 * reported through {@code MyError.exit}.
 *
 * @param path location of the file to write
 * @param payload text to store in the file
 */
public static void putFileContent(String path, String payload) {
  // try-with-resources closes the writer even if write() throws; a failure in close() is
  // still routed to the catch block below, as before.
  try (BufferedWriter outFile = Common.openFileForWriting(path)) {
    outFile.write(payload);
  } catch (IOException e) {
    MyError.exit("Couldn't write to file '" + path + "'!");
  }
}
/** * A simple string normalizer * * @param text input text * @return normalized text */ public static String normalizeNotTokenized(String text) { // TODO: some concpets have '\r\n' and need them. find a way to remove 'replace("\r", " // ").replace("\n", " ")'. known issues if do so: permamnet concept ids file text = text.replace("ك", "ک") .replace("ي", "ی") .replace("ى", "ی") .replace("\r", " ") .replace("\n", " "); text = text.replace("ي", "ی") .replace("ی", "ی") .replace("ى", "ی") .replace("ك", "ک") .replace("ک", "ک"); text = text.replaceAll( String.valueOf(Character.toChars(8203)), new String(Character.toChars(8204))); text = text.replaceAll(String.valueOf(Character.toChars(1609)), "ی"); // arabic letter ye maksura text = replaceCorresponding(text, "۰۱۲۳۴۵۶۷۸۹", "0123456789"); text = replaceCorresponding(text, "٠١٢٣٤٥٦٧٨٩", "0123456789"); // correcting punctuation spacings, commented as it contradicts the tokenizer's output // text = text.replaceAll(" ([;,،؛:])", "$1 "); // text = text.replaceAll("\\(", " \\("); // text = text.replaceAll("\\)", "\\) "); text = text.replace(" ", " "); text = Common.trimAll(text, "\" \u200C"); return text; }