예제 #1
0
  /**
   * Rewrites the religious honorifics found in {@code text} into one canonical notation.
   *
   * <p>The steps run in this order:
   *
   * <ol>
   *   <li>the text is passed through {@code Common.removeDiacritic};
   *   <li>spelled-out salutations (optionally wrapped in parentheses) are replaced by the short
   *       markers {@code ( ع )}, {@code ( س )} and {@code ( ص )}, and empty parentheses are
   *       dropped;
   *   <li>a marker that directly follows "حضرت NAME" is removed, then every "حضرت NAME" is
   *       rewritten to "NAME ( ص )" when NAME is محمد and to "NAME ( ع )" otherwise;
   *   <li>the result goes through {@code Common.removeParenthesisWithException} (presumably
   *       dropping parenthesised groups other than the three markers; verify against that
   *       helper), zero-width non-joiners become spaces, one pass of double-space collapsing is
   *       applied and the text is trimmed.
   * </ol>
   *
   * @param text the text to canonicalize; must not be {@code null}
   * @return the canonical form, or the empty string when {@code text} is empty
   * @throws NullPointerException if {@code text} is {@code null}
   */
  public static String canonicalizeString(String text) {
    if (text.isEmpty()) {
      return "";
    }

    text = Common.removeDiacritic(text);

    // Spelled-out salutations -> short markers. Each replaceAll sees the output of the previous
    // one, so the order of these statements matters.
    text = text.replaceAll("(\\(\\s*)?علیها.?لسلام(\\s*\\))?", "( ع )");
    text = text.replaceAll("(\\(\\s*)?علیها.?سلام(\\s*\\))?", "( ع )");
    text = text.replaceAll("(\\(\\s*)?علیها.?السلام(\\s*\\))?", "( ع )");
    text = text.replaceAll("(\\(\\s*)?علیه.?السلام(\\s*\\))?", "( ع )");
    text = text.replaceAll("(\\(\\s*)?سلام.?الله.?علیه(\\(\\s*)?(\\s*\\))?", "( س )");
    // text = text.replaceAll("(\\(\\s*)?صلی.?الله.?علیه.?و.?آله.?و.?سلم(\\s*\\))?", "( ص )");
    text = text.replaceAll("(\\(\\s*)?صلی.?الله.?علیه.?و.?آله(\\s*\\))?", "( ص )");
    text = text.replaceAll("(\\(\\s*)?صلی.?الله(\\s*\\))?", "( ص )");
    text = text.replace("()", "");
    text = text.replace("(  )", ""); // tokenized version

    // Converting "حضرت محمد (ص)" and "حضرت محمد" to the canonical form "محمد (ص)": first strip
    // any marker that follows "حضرت NAME", so that both spellings look the same to the loop below.
    text = text.replaceAll("حضرت ([^ ]+) \\(\\s*ص\\s*\\)", "حضرت $1");
    text = text.replaceAll("حضرت ([^ ]+) \\(\\s*ع\\s*\\)", "حضرت $1");
    text = text.replaceAll("حضرت ([^ ]+) \\(\\s*س\\s*\\)", "حضرت $1");
    text = text.replace("  ", " ");

    // NOTE(review): because of the leading \s*, a match may start on the space in front of
    // "حضرت"; at that position the text behind is not "آن " yet, so the negative lookbehind does
    // not exclude "آن حضرت NAME". Behavior is kept as it was; confirm whether that is intended.
    Pattern titlePattern = Pattern.compile("(?<!آن )\\s*(حضرت [^ ]+)");
    Matcher titleMatcher = titlePattern.matcher(text);

    String modified = text;

    while (titleMatcher.find()) {
      String titledName = titleMatcher.group(1);

      // Skip the title and the single space that follows it.
      String name = titledName.substring("حضرت".length() + 1);

      // String.replace rewrites every occurrence of the titled name, not just this match.
      if (name.equals("محمد")) {
        modified = modified.replace(titledName, name + " ( ص )");
      } else {
        modified = modified.replace(titledName, name + " ( ع )");
      }
    }

    text = modified;

    text = Common.removeParenthesisWithException(text, "ص", "ع", "س");

    // U+200C (zero-width non-joiner) becomes a plain space before the final clean-up.
    text = text.replace('\u200C', ' ').replace("  ", " ").trim();

    return text;
  }
예제 #2
0
  /**
   * Collects lexical variants of {@code original}.
   *
   * <p>The result starts with the canonical form returned by {@code Common.canonicalizeString}.
   * Honorific phrases and markers are then stripped from that form, and the stripped text is
   * handed to {@code probeLexicalTransformation}; whatever that helper returns is added to the
   * result. Finally {@code original} itself is removed from the result, so the list only holds
   * strings that differ from the input.
   *
   * @param original the text to derive variants from; must not be {@code null}
   * @return a new list of distinct variants in no particular order; empty when {@code original}
   *     is empty
   */
  public static ArrayList<String> getLexicalTransformations(String original) {
    HashSet<String> outs = new HashSet<String>();

    if (original.isEmpty()) {
      return new ArrayList<String>(outs);
    }

    outs.add(Common.canonicalizeString(original));

    // Regexes for the honorifics to strip, applied in array order. The long salawat phrases come
    // before the shorter phrases they start with; otherwise the short form would be removed
    // first and leave a dangling "و سلم" behind.
    String[] honorificPatterns = {
      "\\bعلیهالسلام\\b",
      "\\bعلیها السلام\\b",
      "\\bعلیه السلام\\b",
      "\\bعلیه‌السلام\\b",
      "\\(\\s*ع\\s*\\)",
      "\\(\\s*س\\s*\\)",
      "\\(\\s*ص\\s*\\)",
      "\\bصلی الله علیه و آله و سلم\\b",
      "\\bصلی الله علیه وآله و سلم\\b",
      "\\bصلی الله علیه وآله وسلم\\b",
      "\\bصلی الله علیه وآله\\b",
      "\\bصلی الله علیه و آله\\b"
    };

    HashSet<String> news = new HashSet<String>();

    // NOTE(review): outs is being iterated while it is also passed to the helper below; the
    // helper must not modify it, or the iteration would fail. Confirm against the helper.
    for (String probe : outs) {
      for (String honorific : honorificPatterns) {
        probe = probe.replaceAll(honorific, "");
      }
      probe = probe.replace("()", "");
      probe = probe.replace("(  )", ""); // tokenized version

      news.addAll(probeLexicalTransformation(probe, outs));
    }

    outs.addAll(news);

    outs.remove(original);

    return new ArrayList<String>(outs);
  }
예제 #3
0
  /**
   * Writes {@code payload} to the file at {@code path}.
   *
   * <p>The writer is obtained from {@code Common.openFileForWriting}; whether an existing file is
   * truncated or appended to depends on that helper. Any {@link IOException} raised while writing
   * or closing is reported through {@code MyError.exit}.
   *
   * @param path location of the file to write
   * @param payload text to write
   */
  public static void putFileContent(String path, String payload) {
    BufferedWriter outFile = Common.openFileForWriting(path);

    try {
      try {
        outFile.write(payload);
      } finally {
        // Close even when the write fails so the file handle is not leaked.
        outFile.close();
      }
    } catch (IOException e) {
      MyError.exit("Couldn't write to file '" + path + "'!");
    }
  }
예제 #4
0
  /**
   * A simple string normalizer for text that has not been tokenized.
   *
   * <p>It unifies Arabic and Persian variants of the letters kaf and yeh, turns carriage returns
   * and line feeds into spaces, replaces U+200B (zero width space) with U+200C (zero width
   * non-joiner), rewrites two sets of non-ASCII digits to ASCII digits, applies one pass of
   * double-space collapsing and finally calls {@code Common.trimAll}.
   *
   * @param text input text; must not be {@code null}
   * @return normalized text
   */
  public static String normalizeNotTokenized(String text) {
    // TODO: some concepts have '\r\n' and need them. Find a way to remove
    // 'replace("\r", " ").replace("\n", " ")'. Known issue if this is done: the permanent concept
    // ids file.
    // Arabic kaf/yeh/alef maksura -> Persian kaf/yeh; line breaks -> spaces.
    text =
        text.replace("ك", "ک")
            .replace("ي", "ی")
            .replace("ى", "ی")
            .replace("\r", " ")
            .replace("\n", " ");

    // NOTE(review): as rendered, this chain repeats the mappings above and maps two letters onto
    // themselves. The literals may be distinct look-alike code points, so confirm the actual
    // code points before simplifying or removing anything here.
    text =
        text.replace("ي", "ی")
            .replace("ی", "ی")
            .replace("ى", "ی")
            .replace("ك", "ک")
            .replace("ک", "ک");

    // 8203 is U+200B (zero width space); 8204 is U+200C (zero width non-joiner).
    text =
        text.replaceAll(
            String.valueOf(Character.toChars(8203)), new String(Character.toChars(8204)));
    // 1609 is U+0649.
    text =
        text.replaceAll(String.valueOf(Character.toChars(1609)), "ی"); // arabic letter ye maksura

    // Two digit sets are mapped to ASCII digits (they look like Persian, then Arabic-Indic
    // digits). Presumably replaceCorresponding maps each character of the second argument to the
    // character at the same index of the third; verify against that helper.
    text = replaceCorresponding(text, "۰۱۲۳۴۵۶۷۸۹", "0123456789");
    text = replaceCorresponding(text, "٠١٢٣٤٥٦٧٨٩", "0123456789");

    // correcting punctuation spacings, commented as it contradicts the tokenizer's output
    // text = text.replaceAll(" ([;,،؛:])", "$1 ");
    // text = text.replaceAll("\\(", " \\(");
    // text = text.replaceAll("\\)", "\\) ");

    // Single pass only: each pair of spaces becomes one, so a run of three or more spaces is
    // shortened but not reduced to a single space.
    text = text.replace("  ", " ");
    // Presumably strips double quotes, spaces and U+200C from both ends; verify against
    // Common.trimAll.
    text = Common.trimAll(text, "\" \u200C");

    return text;
  }