Exemple #1
0
 /**
  * Replaces every wiki link {@code [[...]]} of {@code s} by its surface form and copies the text
  * outside of links unchanged.
  *
  * <ul>
  *   <li>{@code [[Target]]} yields {@code Target}; {@code [[Target|label]]} yields {@code label}
  *       (the text after the last {@code '|'}). The kept text goes through {@code
  *       Util.unescapeXML} before being appended.
  *   <li>An outermost link whose content contains a {@code ':'} (for instance {@code
  *       [[File: here we [[Go]] ]]}) is dropped entirely, nested links included.
  *   <li>A {@code "]]"} that closes nothing is kept as literal text.
  *   <li>If a {@code "[["} is never closed, everything after it is discarded.
  * </ul>
  *
  * @param s the wiki text to clean, must not be null
  * @return the text with links replaced by their surface forms
  */
 public static String cleanSurfaceForms(String s) {
   int len = s.length();
   StringBuilder output = new StringBuilder(len);
   // Nesting depth of [[ ... ]]; the only difficulty is to deal with nested links such as
   // [[File: here we [[Go]] ]].
   int nb_brackets = 0;
   // Index of the first character following the outermost "[[".
   int start_bracket = 0;
   int i = 0;
   // Iterate up to len (not len - 1) so that the final character is not lost.
   while (i < len) {
     char c = s.charAt(i);
     // '\0' never equals a bracket, so the last character is simply treated as plain text.
     char c_1 = (i + 1 < len) ? s.charAt(i + 1) : '\0';

     if (c == '[' && c_1 == '[') {
       if (nb_brackets == 0) {
         start_bracket = i + 2;
       }
       nb_brackets++;
       i += 2;
       continue;
     }

     // Only treat "]]" as a closer when a link is actually open; otherwise it is plain text and
     // must not push the depth below zero (which would silence the rest of the string).
     if (c == ']' && c_1 == ']' && nb_brackets > 0) {
       nb_brackets--;
       if (nb_brackets == 0) {
         // time to look at what was in that link
         boolean inner_wiki_ref = false;
         int index_pipe = -1;
         for (int k = start_bracket; k < i; k++) {
           char inner = s.charAt(k);
           if (inner == ':') {
             inner_wiki_ref = true;
             break;
           }
           if (inner == '|') {
             index_pipe = k + 1;
           }
         }
         if (!inner_wiki_ref) {
           int surface_start = (index_pipe != -1) ? index_pipe : start_bracket;
           output.append(Util.unescapeXML(s.substring(surface_start, i)));
         }
       }
       // Always consume both closing brackets so that "]]" never leaks into the output.
       i += 2;
       continue;
     }

     if (nb_brackets == 0) {
       output.append(c);
     }
     i++;
   }
   return output.toString();
 }
Exemple #2
0
  /**
   * Gets the correct uri of a page, following a redirect if the page declares one.
   *
   * <p>If {@code page} contains a {@code <redirect title="..." />} element, the uri is built from
   * the redirect title (surrounding whitespace and double quotes removed, then passed through
   * {@code Util.unescapeXML}). Otherwise, or when the redirect element is not terminated by
   * {@code "/>"}, the uri is built from the title returned by {@code Util.getTitle(page)}.
   *
   * @param page the raw content of the page, must not be null
   * @param language forwarded unchanged to {@code uriStandardization}
   * @return the result of {@code uriStandardization} for the redirect target or the page title
   */
  public static String getResolvedPageUri(String page, String language) {
    final String redirect_marker = "<redirect title=";

    // Looked up first, as before, so that any failure of getTitle surfaces identically.
    String title = Util.getTitle(page);

    int start_redirect = page.indexOf(redirect_marker);
    if (start_redirect != -1) {
      int start_value = start_redirect + redirect_marker.length();
      int end_redirect = page.indexOf("/>", start_value);
      // A malformed (unterminated) redirect element previously made substring() throw; fall back
      // to the page's own title instead.
      if (end_redirect != -1) {
        String redirect = page.substring(start_value, end_redirect).trim();
        // Literal replacement: no regular expression is needed to strip the quotes.
        redirect = redirect.replace("\"", "");
        redirect = Util.unescapeXML(redirect);
        return uriStandardization(redirect, null, language);
      }
    }
    return uriStandardization(title, null, language);
  }
Exemple #3
0
  /**
   * Takes a Wikipedia page as input and outputs its cleaned text as a list of strings, one per
   * paragraph. A paragraph ends on a blank line or on a line break adjacent to a {@code '='}
   * (section title).
   *
   * <p>Always removed: the comments (&lt;!-- comment --&gt;), the tables {| |}, the templates {{
   * }} (so most tables are lost), the content of html div blocks, the quote runs used for
   * bold/italic (''), the runs of '=' used for titles, and the other markups (a &lt; followed by
   * a &gt; less than 120 characters away).
   *
   * <p>The escaped sequences listed in the {@code to_unescape} static variable are replaced by
   * the matching entry of {@code unescaped}.
   *
   * <p>Stray closing tokens ("}}", "|}" or "]]" without a matching opener) are skipped and never
   * make the corresponding nesting counter negative, so they do not disable the cleaning of the
   * well-formed constructs that follow.
   *
   * @param page the raw page; the text is read between {@code <text xml:space="preserve">} and
   *     {@code </text>}
   * @param ignore_lists if true, every line that starts with *, # or : is removed
   * @param ignore_ref if true, the references (&lt;ref [...] /ref&gt;) are removed
   * @param clean_links if true, "[[Something|a mystery thing]]" becomes "a mystery thing" and a
   *     link containing ':' such as "[[File:path/to/truc]]" is removed
   * @param filter_by_size if true, returns null when the raw text span or the cleaned text is
   *     shorter than 100 characters
   * @return the paragraphs, or null if {@code isWikiText(page)} is false, if the text element is
   *     not found, or if the page is rejected by {@code filter_by_size}
   */
  public static List<String> getCleanTextFromPage(
      String page,
      boolean ignore_lists,
      boolean ignore_ref,
      boolean clean_links,
      boolean filter_by_size) {
    // if it is not a wikitext page, return null
    if (!isWikiText(page)) {
      return null;
    }

    // go to the index of the text
    int index_text = page.indexOf("<text xml:space=\"preserve\">");
    if (index_text == -1) return null;
    index_text += "<text xml:space=\"preserve\">".length();

    // locate the end
    int end_text = page.indexOf("</text>");

    if (filter_by_size) {
      // text is too short
      if (end_text - index_text < 100) return null;
    }

    StringBuilder sb = new StringBuilder();
    List<String> output = new ArrayList<String>();
    int nb_accolades = 0; // nesting depth of "{{"
    int nb_semi_accolades = 0; // nesting depth of "{|"
    boolean is_in_ref = false;
    boolean is_in_markup = false;
    boolean is_in_comment = false;
    boolean is_in_div = false;
    int div_level = 0;
    boolean is_in_list = false;
    int nb_brackets = 0; // nesting depth of "[["
    int start_bracket = 0; // index following the outermost "[["
    int total_length = 0;

    for (int i = index_text - 1; i < end_text - 1; i++) {
      char c = page.charAt(i);
      char c_1 = page.charAt(i + 1);
      // for good measure, the text virtually starts with a '\n' (the
      // list pattern includes '\n')
      if (i == index_text - 1) {
        c = '\n';
      }

      // first we remove everything between {{ }}
      if (c == '{' && c_1 == '{') {
        nb_accolades++;
        i++;
        continue;
      }
      if (c == '}' && c_1 == '}') {
        // clamped like div_level: a stray "}}" must not unbalance the following templates
        nb_accolades = Math.max(0, nb_accolades - 1);
        i++;
        continue;
      }
      if (nb_accolades > 0) continue;

      // Then we remove everything between {| |}
      if (c == '{' && c_1 == '|') {
        nb_semi_accolades++;
        i++;
        continue;
      }
      if (c == '|' && c_1 == '}') {
        nb_semi_accolades = Math.max(0, nb_semi_accolades - 1);
        i++;
        continue;
      }
      if (nb_semi_accolades > 0) continue;

      // if we specify it, we can ignore everything that is in a list (starts with \n* or \n# or
      // \n:). The list ends at the next '\n' that is not followed by a list marker.
      if (ignore_lists && c == '\n' && (c_1 == '*' || c_1 == '#' || c_1 == ':')) {
        is_in_list = true;
        continue;
      }
      if (ignore_lists && c == '\n' && is_in_list) is_in_list = false;
      if (ignore_lists && is_in_list) continue;

      // if it is a new paragraph, so either \n\n or =\n or \n=, we put the content of the
      // StringBuilder in a new String
      if (c == '\n' && (c_1 == '\n' || c_1 == '=' || page.charAt(i - 1) == '=')) {
        if (sb.length() > 1) {
          String text_paragraph = superTrim(sb.toString());
          if (text_paragraph.length() > 1) output.add(text_paragraph);
          total_length += sb.length();
        }
        // reset the stringbuilder
        sb.setLength(0);
      }

      // deal with the comments (&lt;!-- --&gt;)
      if (c == '&' && c_1 == 'l' && end_text > i + 7 && page.startsWith("&lt;!--", i)) {
        is_in_comment = true;
        i += 6;
        continue;
      }
      if (c == '-' && c_1 == '-' && end_text > i + 6 && page.startsWith("--&gt;", i)) {
        is_in_comment = false;
        i += 5;
        continue;
      }

      // Sometimes there are some html div in the dump, we try to remove them.
      if (c == '&' && c_1 == 'l' && end_text > i + 7 && page.startsWith("&lt;div", i)) {
        is_in_div = true;
        div_level++;
        i += 6;
        continue;
      }
      // The closing token is 12 characters long and must lie entirely inside the text.
      if (c == '&' && c_1 == 'l' && i + 12 <= end_text && page.startsWith("&lt;/div&gt;", i)) {
        div_level = Math.max(0, div_level - 1);
        if (div_level == 0) is_in_div = false;
        i += 11;
        continue;
      }

      // remove the ''' ''' (bold) and === === (title)
      if (c == '\'' && c_1 == '\'') {
        while (i < end_text && page.charAt(i) == '\'') i++;
        i--; // compensate the i++ of the for loop
        continue;
      }
      if (c == '=' && c_1 == '=') {
        while (i < end_text && page.charAt(i) == '=') i++;
        i--; // compensate the i++ of the for loop
        continue;
      }

      // deals with the links
      if (clean_links) {
        if (c == '[' && c_1 == '[') {
          nb_brackets++;
          if (nb_brackets == 1) start_bracket = i + 2;
          i++;
          continue;
        }

        if (c == ']' && c_1 == ']') {
          // a "]]" without an open link is just skipped; it must not make the depth negative,
          // otherwise the following links would no longer be recognised
          if (nb_brackets > 0) {
            nb_brackets--;
            if (nb_brackets == 0 && !is_in_ref && !is_in_comment) {
              // time to look at what was in that link
              boolean inner_wiki_ref = false;
              int index_pipe = -1;
              for (int k = start_bracket; k < i; k++) {
                if (page.charAt(k) == ':') {
                  inner_wiki_ref = true;
                  break;
                }
                if (page.charAt(k) == '|') index_pipe = k + 1;
              }
              if (!inner_wiki_ref) {
                if (index_pipe != -1) {
                  sb.append(Util.unescapeXML(page.substring(index_pipe, i)));
                } else {
                  sb.append(Util.unescapeXML(page.substring(start_bracket, i)));
                }
              }
            }
          }
          i++;
          continue;
        }
        if (nb_brackets > 0) continue;
      }

      if (ignore_ref) {
        // deal with the references (&lt;ref&gt;)
        if (c == '&' && c_1 == 'l' && end_text > i + 7 && page.startsWith("&lt;ref", i)) {
          is_in_ref = true;
          i += 6;
          // particular case of the <ref name="thing"/>: look for the end of the opening tag
          int j = i;
          while (j < end_text - 5 && !page.startsWith("&gt;", j)) {
            j++;
          }
          // only trust the '/' test when the end of the tag was actually found
          if (page.startsWith("&gt;", j) && page.charAt(j - 1) == '/') {
            is_in_ref = false;
            i = j + 3;
          }
          continue;
        }

        if (c == '&' && c_1 == 'l' && end_text > i + 12 && page.startsWith("&lt;/ref&gt;", i)) {
          is_in_ref = false;
          i += 11;
          continue;
        }
      }

      // remove other kinds of markup. A markup starts with &lt;, and there is a &gt; less than
      // 120 characters away
      if (c == '&'
          && !is_in_ref
          && !is_in_comment
          && end_text > i + 4
          && c_1 == 'l'
          && page.charAt(i + 2) == 't'
          && page.charAt(i + 3) == ';') {
        // look if there is a &gt; less than 120 characters away. If
        // not, we don't remove it.
        int next_gt = page.indexOf("&gt;", i);
        if (next_gt != -1 && next_gt - 120 < i) {
          is_in_markup = true;
          i += 3;
          continue;
        }
      }
      if (!is_in_ref
          && c == '&'
          && end_text > i + 4
          && c_1 == 'g'
          && page.charAt(i + 2) == 't'
          && page.charAt(i + 3) == ';') {
        is_in_markup = false;
        i += 3;
        continue;
      }

      // And to finish, unescape remaining xml tags. See the to_unescape static variable
      if (c == '&') {
        boolean we_made_a_replacement = false;
        for (int unescaped_index = 0; unescaped_index < to_unescape.length; unescaped_index++) {
          String pattern = to_unescape[unescaped_index];
          boolean match = true;
          if (end_text <= i + pattern.length()) continue;
          for (int i_sub = 0; i_sub < pattern.length(); i_sub++) {
            if (page.charAt(i + i_sub) != pattern.charAt(i_sub)) {
              match = false;
              break;
            }
          }
          if (match) {
            we_made_a_replacement = true;
            if (!is_in_ref && !is_in_markup && !is_in_comment && !is_in_list && !is_in_div)
              sb.append(unescaped[unescaped_index]);
            i += pattern.length() - 1;
            break;
          }
        }
        if (we_made_a_replacement) continue;
      }

      if (!is_in_ref && !is_in_markup && !is_in_comment && !is_in_list && !is_in_div) {
        sb.append(c);
      }
    }
    total_length += sb.length();

    if (filter_by_size) {
      if (total_length < 100) return null;
    }

    // flush the last paragraph
    String text_paragraph = superTrim(sb.toString());
    if (text_paragraph.length() > 1) {
      output.add(text_paragraph);
    }
    return output;
  }