/** * A simple string normalizer * * @param text input text * @return normalized text */ public static String normalizeNotTokenized(String text) { // TODO: some concpets have '\r\n' and need them. find a way to remove 'replace("\r", " // ").replace("\n", " ")'. known issues if do so: permamnet concept ids file text = text.replace("ك", "ک") .replace("ي", "ی") .replace("ى", "ی") .replace("\r", " ") .replace("\n", " "); text = text.replace("ي", "ی") .replace("ی", "ی") .replace("ى", "ی") .replace("ك", "ک") .replace("ک", "ک"); text = text.replaceAll( String.valueOf(Character.toChars(8203)), new String(Character.toChars(8204))); text = text.replaceAll(String.valueOf(Character.toChars(1609)), "ی"); // arabic letter ye maksura text = replaceCorresponding(text, "۰۱۲۳۴۵۶۷۸۹", "0123456789"); text = replaceCorresponding(text, "٠١٢٣٤٥٦٧٨٩", "0123456789"); // correcting punctuation spacings, commented as it contradicts the tokenizer's output // text = text.replaceAll(" ([;,،؛:])", "$1 "); // text = text.replaceAll("\\(", " \\("); // text = text.replaceAll("\\)", "\\) "); text = text.replace(" ", " "); text = Common.trimAll(text, "\" \u200C"); return text; }