예제 #1
0
  /**
   * Normalizes raw (not yet tokenized) text.
   *
   * <p>Steps, in order:
   *
   * <ol>
   *   <li>Arabic kaf / yeh / alef maksura variants are replaced by "ک" and "ی".
   *   <li>Carriage returns and line feeds are replaced by a space.
   *   <li>ZERO WIDTH SPACE (U+200B) is replaced by ZERO WIDTH NON-JOINER (U+200C).
   *   <li>The digit sets "۰۱۲۳۴۵۶۷۸۹" and "٠١٢٣٤٥٦٧٨٩" are passed to {@code
   *       replaceCorresponding} together with "0123456789" (presumably a position-wise
   *       character mapping to ASCII digits; confirm against that helper).
   *   <li>Every run of two or more spaces is collapsed to a single space.
   *   <li>The result is passed to {@code Common.trimAll} with the characters double quote,
   *       space and U+200C (presumably stripped from both ends; confirm against that helper).
   * </ol>
   *
   * @param text input text; must not be {@code null}
   * @return normalized text
   */
  public static String normalizeNotTokenized(String text) {
    // TODO: some concepts contain "\r\n" and need it preserved. Find a way to drop the CR/LF
    // replacements below. Known issue if they are dropped: the permanent concept ids file.
    text =
        text.replace("ك", "ک")
            .replace("ي", "ی")
            .replace("ى", "ی")
            .replace("\r", " ")
            .replace("\n", " ");

    // NOTE(review): this second pass looks redundant with the one above, and two of its
    // replacements look like no-ops as rendered. The literals may differ in code point
    // (e.g. presentation forms), so they are kept verbatim; verify before removing.
    text =
        text.replace("ي", "ی")
            .replace("ی", "ی")
            .replace("ى", "ی")
            .replace("ك", "ک")
            .replace("ک", "ک");

    // Literal substitutions: no regex needed. U+200B (zero width space) -> U+200C (ZWNJ).
    text = text.replace('\u200B', '\u200C');
    text = text.replace("\u0649", "ی"); // arabic letter alef maksura (code point 1609)

    text = replaceCorresponding(text, "۰۱۲۳۴۵۶۷۸۹", "0123456789");
    text = replaceCorresponding(text, "٠١٢٣٤٥٦٧٨٩", "0123456789");

    // correcting punctuation spacings, commented as it contradicts the tokenizer's output
    // text = text.replaceAll(" ([;,،؛:])", "$1 ");
    // text = text.replaceAll("\\(", " \\(");
    // text = text.replaceAll("\\)", "\\) ");

    // Collapse every run of spaces to one. A single replace("  ", " ") pass only halves a
    // run (four spaces would become two), and the CR/LF replacement above easily creates
    // such runs.
    text = text.replaceAll(" {2,}", " ");
    text = Common.trimAll(text, "\" \u200C");

    return text;
  }