public static String cleanSurfaceForms(String s) {
  int len = s.length();
  StringBuilder output = new StringBuilder();
  // the only difficulty is to deal with nested [[, like
  // "[[File: here we [[Go]] ]]", for instance
  int nb_brackets = 0;
  int start_bracket = 0;
  for (int i = 0; i < len; i++) {
    char c = s.charAt(i);
    // look ahead one character; '\0' stands for "past the end of the string"
    char c_1 = (i + 1 < len) ? s.charAt(i + 1) : '\0';
    if (c == '[' && c_1 == '[') {
      if (nb_brackets == 0) start_bracket = i + 2;
      nb_brackets++;
      i++;
    }
    if (c == ']' && c_1 == ']') {
      nb_brackets--;
      if (nb_brackets == 0) {
        // time to look at what was in that link
        boolean inner_wiki_ref = false;
        int index_pipe = -1;
        for (int k = start_bracket; k < i; k++) {
          if (s.charAt(k) == ':') {
            inner_wiki_ref = true;
            break;
          }
          if (s.charAt(k) == '|') index_pipe = k + 1;
        }
        if (!inner_wiki_ref) {
          // keep the surface form (after the pipe) if there is one, else the whole target
          if (index_pipe != -1) {
            output.append(Util.unescapeXML(s.substring(index_pipe, i)));
          } else {
            output.append(Util.unescapeXML(s.substring(start_bracket, i)));
          }
        }
        // skip the closing "]]" in both cases (links whose target contains ':' are dropped)
        i += 1;
        continue;
      }
    }
    if (nb_brackets == 0) {
      output.append(c);
    }
  }
  return output.toString();
}
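/**
 * Illustrative sketch, not part of the original code: a minimal demonstration of what
 * cleanSurfaceForms is expected to produce on hand-written wikitext snippets, assuming
 * Util.unescapeXML is a plain XML-entity unescaper.
 */
public static void demoCleanSurfaceForms() {
  // link with a surface form: the text after the pipe is kept
  System.out.println(cleanSurfaceForms("the [[Go (game)|game of Go]] is old"));
  // -> "the game of Go is old"

  // link without a pipe: the target itself is kept
  System.out.println(cleanSurfaceForms("see [[History of Go]] for details"));
  // -> "see History of Go for details"

  // link whose target contains ':' (File:, Category:, ...): dropped entirely
  System.out.println(cleanSurfaceForms("a [[File:board.jpg|thumb|caption]] b"));
  // -> "a  b"
}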
/**
 * Gets the correct uri of a page, following any redirect.
 *
 * @param page the raw page, as extracted from the dump
 * @param language the language of the wiki
 * @return the standardized uri of the page, or of its redirect target if there is one
 */
public static String getResolvedPageUri(String page, String language) {
  /** look for the title of the page */
  String title = Util.getTitle(page);
  if (page.contains("<redirect title=")) {
    /** extract the target of the <redirect title="..." /> element */
    int start_redirect = page.indexOf("<redirect title=");
    int end_redirect = page.indexOf("/>", start_redirect);
    String redirect = page.substring(start_redirect + "<redirect title=".length(), end_redirect);
    redirect = redirect.trim();
    redirect = redirect.replaceAll("\"", "");
    redirect = Util.unescapeXML(redirect);
    return uriStandardization(redirect, null, language);
  } else {
    return uriStandardization(title, null, language);
  }
}
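/**
 * Illustrative sketch, not part of the original code: resolving a redirect page. The
 * fragment below mimics the <redirect title="..."/> element found in the dump; the
 * expected result is whatever uriStandardization("New York City", null, "en") returns.
 * Without a redirect element, the method falls back to the title from Util.getTitle(page).
 */
public static void demoGetResolvedPageUri() {
  String page =
      "<page>\n"
          + "  <title>NYC</title>\n"
          + "  <redirect title=\"New York City\" />\n"
          + "  <text xml:space=\"preserve\">#REDIRECT [[New York City]]</text>\n"
          + "</page>";
  System.out.println(getResolvedPageUri(page, "en"));
}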
/**
 * One of the main methods. The older getCleanText was a bit more efficient, but its
 * input/output interface was clumsy. Anyway, this method takes a Wikipedia page as input
 * and outputs a list of Strings, one per paragraph.
 *
 * <p>By default, it will remove:
 * - the comments (&lt;!-- comment --&gt;)
 * - the resources {| |}, and also the scripts {{ }}. This implies that we lose most of
 *   the tables, but I take the bet that enough text will remain.
 * - the other markups (&lt; and &gt;)
 *
 * <p>Optionally you can remove:
 * - the lists (ignore_lists): every line that starts with *, # or : is dropped
 * - the references (&lt;ref ... /ref&gt;)
 * - the links (clean_links): "[[Something|a mystery thing]]" becomes "a mystery thing",
 *   and "[[File:path/to/thing]]" is removed entirely.
 *
 * <p>The filter_by_size parameter says whether the algorithm throws out pages that are
 * too small. It should be false for abstracts, since those pages are artificially
 * shortened.
 *
 * <p>It also unescapes all the remaining XML entities. If the page is not an article,
 * it returns null.
 */
public static List<String> getCleanTextFromPage(
    String page,
    boolean ignore_lists,
    boolean ignore_ref,
    boolean clean_links,
    boolean filter_by_size) {

  /** if it is not a wikitext page, return null */
  if (!isWikiText(page)) {
    return null;
  }

  /** go to the index of the text */
  int index_text = page.indexOf("<text xml:space=\"preserve\">");
  if (index_text == -1) return null;
  index_text += "<text xml:space=\"preserve\">".length();

  /** locate the end */
  int end_text = page.indexOf("</text>");

  if (filter_by_size) {
    /** text is too short */
    if (end_text - index_text < 100) return null;
  }

  StringBuilder sb = new StringBuilder();
  ArrayList<String> output = new ArrayList<String>();

  int nb_accolades = 0;
  int nb_semi_accolades = 0; // detects the "{|"
  boolean is_in_ref = false;
  boolean is_in_markup = false;
  boolean is_in_comment = false;
  boolean is_in_div = false;
  int div_level = 0;
  boolean is_in_list = false;
  int nb_brackets = 0;
  int start_bracket = 0;
  int total_length = 0;

  for (int i = index_text - 1; i < end_text - 1; i++) {
    char c = page.charAt(i);
    char c_1 = page.charAt(i + 1);

    // for good measure, the text virtually starts with a '\n' (the
    // list pattern includes '\n')
    if (i == index_text - 1) {
      c = '\n';
    }

    /** first we remove everything between {{ }} */
    if (c == '{' && c_1 == '{') {
      nb_accolades++;
      i++;
      continue;
    }
    if (c == '}' && c_1 == '}') {
      nb_accolades--;
      i++;
      continue;
    }
    if (nb_accolades > 0) continue;

    /** then we remove everything between {| |} */
    if (c == '{' && c_1 == '|') {
      nb_semi_accolades++;
      i++;
      continue;
    }
    if (c == '|' && c_1 == '}') {
      nb_semi_accolades--;
      i++;
      continue;
    }
    if (nb_semi_accolades > 0) continue;

    /**
     * if we specify it, we can ignore everything that is in a list (starts with \n* or
     * \n# or \n:).
     */
    if (ignore_lists && c == '\n' && (c_1 == '*' || c_1 == '#' || c_1 == ':')) {
      is_in_list = true;
      continue;
    }
    if (ignore_lists && c == '\n' && is_in_list) is_in_list = false;
    if (ignore_lists && is_in_list) continue;

    /**
     * if it is a new paragraph (either \n\n, =\n or \n=), we put the content of the
     * StringBuilder in a new String
     */
    if (c == '\n' && (c_1 == '\n' || c_1 == '=' || page.charAt(i - 1) == '=')) {
      if (sb.length() > 1) {
        String text_paragraph = superTrim(sb.toString());
        if (text_paragraph.length() > 1) output.add(text_paragraph);
        total_length += sb.length();
      }
      /** reset the stringbuilder */
      sb.setLength(0);
    }

    /** deal with the comments (&lt;!-- --&gt;) */
    if (c == '&' && c_1 == 'l' && end_text > i + 7 && page.substring(i, i + 7).equals("&lt;!--")) {
      is_in_comment = true;
      i += 6;
      continue;
    }
    if (c == '-' && c_1 == '-' && end_text > i + 6 && page.substring(i, i + 6).equals("--&gt;")) {
      is_in_comment = false;
      i += 5;
      continue;
    }

    /** sometimes there are some html div (yeah...) in the dump, we try to remove them */
    if (c == '&' && c_1 == 'l' && end_text > i + 7 && page.substring(i, i + 7).equals("&lt;div")) {
      is_in_div = true;
      div_level++;
      i += 6;
      continue;
    }
    if (c == '&'
        && c_1 == 'l'
        && end_text > i + 12
        && page.substring(i, i + 12).equals("&lt;/div&gt;")) {
      div_level = Math.max(0, div_level - 1);
      if (div_level == 0) is_in_div = false;
      i += 11;
      continue;
    }

    /** remove the ''' ''' (bold) and === === (title) */
    if (c == '\'' && c_1 == '\'') {
      while (i < end_text && page.charAt(i) == '\'') i++;
      i--;
      continue;
    }
    if (c == '=' && c_1 == '=') {
      while (i < end_text && page.charAt(i) == '=') i++;
      i--;
      continue;
    }

    /** deal with the links */
    if (clean_links) {
      if (c == '[' && c_1 == '[') {
        nb_brackets++;
        if (nb_brackets == 1) start_bracket = i + 2;
        i++;
        continue;
      }
      if (c == ']' && c_1 == ']') {
        nb_brackets--;
        if (nb_brackets == 0 && !is_in_ref && !is_in_comment) {
          // time to look at what was in that link
          boolean inner_wiki_ref = false;
          int index_pipe = -1;
          for (int k = start_bracket; k < i; k++) {
            if (page.charAt(k) == ':') {
              inner_wiki_ref = true;
              break;
            }
            if (page.charAt(k) == '|') index_pipe = k + 1;
          }
          if (!inner_wiki_ref) {
            if (index_pipe != -1) {
              sb.append(Util.unescapeXML(page.substring(index_pipe, i)));
            } else {
              sb.append(Util.unescapeXML(page.substring(start_bracket, i)));
            }
          }
        }
        i++;
        continue;
      }
      if (nb_brackets > 0) continue;
    }

    if (ignore_ref) {
      /** deal with the references (&lt;ref&gt;) */
      if (c == '&'
          && c_1 == 'l'
          && end_text > i + 7
          && page.substring(i, i + 7).equals("&lt;ref")) {
        is_in_ref = true;
        i += 6;
        /** particular case of the self-closing &lt;ref name="thing"/&gt; */
        int j = i;
        while (j < end_text - 5 && !page.substring(j, j + 4).equals("&gt;")) {
          j++;
        }
        if (page.charAt(j - 1) == '/') {
          is_in_ref = false;
          i = j + 3;
        }
        continue;
      }
      if (c == '&'
          && c_1 == 'l'
          && end_text > i + 12
          && page.substring(i, i + 12).equals("&lt;/ref&gt;")) {
        is_in_ref = false;
        i += 11;
        continue;
      }
    }

    /**
     * remove other kinds of markup. A markup starts with &lt;, and there is a &gt; less
     * than 120 characters away
     */
    if (c == '&'
        && !is_in_ref
        && !is_in_comment
        && end_text > i + 4
        && c_1 == 'l'
        && page.charAt(i + 2) == 't'
        && page.charAt(i + 3) == ';') {
      // look if there is a &gt; less than 120 characters away. If not, we don't remove it.
      int next_gt = page.indexOf("&gt;", i);
      if (next_gt != -1 && next_gt - 120 < i) {
        is_in_markup = true;
        i += 3;
        continue;
      }
    }
    if (!is_in_ref
        && c == '&'
        && end_text > i + 4
        && c_1 == 'g'
        && page.charAt(i + 2) == 't'
        && page.charAt(i + 3) == ';') {
      is_in_markup = false;
      i += 3;
      continue;
    }

    /** and to finish, unescape the remaining xml entities. See the to_unescape static variable */
    if (c == '&') {
      boolean we_made_a_replacement = false;
      for (int unescaped_index = 0; unescaped_index < to_unescape.length; unescaped_index++) {
        String pattern = to_unescape[unescaped_index];
        boolean match = true;
        if (end_text <= i + pattern.length()) continue;
        for (int i_sub = 0; i_sub < pattern.length(); i_sub++) {
          if (page.charAt(i + i_sub) != pattern.charAt(i_sub)) {
            match = false;
            break;
          }
        }
        if (match) {
          we_made_a_replacement = true;
          if (!is_in_ref && !is_in_markup && !is_in_comment && !is_in_list && !is_in_div)
            sb.append(unescaped[unescaped_index]);
          i += pattern.length() - 1;
          break;
        }
      }
      if (we_made_a_replacement) continue;
    }

    if (!is_in_ref && !is_in_markup && !is_in_comment && !is_in_list && !is_in_div) {
      sb.append(c);
    }
  }

  total_length += sb.length();
  if (filter_by_size) {
    if (total_length < 100) return null;
  }

  String text_paragraph = superTrim(sb.toString());
  if (text_paragraph.length() > 1) {
    output.add(text_paragraph);
  }

  return output;
}
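/**
 * Illustrative usage sketch, not part of the original code: how getCleanTextFromPage is
 * meant to be called on one raw &lt;page&gt; fragment from the dump (the wikitext is
 * still XML-escaped inside &lt;text xml:space="preserve"&gt;...&lt;/text&gt;).
 */
public static void demoGetCleanTextFromPage(String page) {
  List<String> paragraphs =
      getCleanTextFromPage(
          page,
          true /* ignore_lists */,
          true /* ignore_ref */,
          true /* clean_links */,
          true /* filter_by_size */);
  // null means: not an article, no <text> element, or filtered out for being too short
  if (paragraphs == null) return;
  for (String paragraph : paragraphs) {
    System.out.println(paragraph);
  }
}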