Java Util.unescapeXML Exemples

Langage de programmation: Java

Espace de nommage/Paquet : java.util

Classe/Type : Util

Méthode/Fonction: unescapeXML

Exemples sur hotexamples.com : 3

Java Util.unescapeXML - 3 exemples trouvés. Ce sont les exemples réels les mieux notés de java.util.Util.unescapeXML extraits de projets open source. Vous pouvez noter les exemples pour nous aider à en améliorer la qualité.

Méthodes fréquemment utilisées

Afficher Cacher

debug(20)

info(13)

getMessage(12)

newInternal(11)

evalString(11)

getBgColor(10)

createRandomAddress(10)

notExpected(9)

discard(9)

message(8)

notexpected(8)

bset(8)

errln(8)

getPackageName(7)

bitSetBetween(7)

objectToByteBuffer(7)

getAdmLabel(6)

stack(6)

getLabel(6)

close(6)

iswindows(5)

className(5)

swallow(5)

needToImplement(4)

objectFromByteBuffer(4)

replace(4)

sleep(4)

evalBoolean(4)

match(4)

toHEX1(4)

toIter(4)

getImageIcon(4)

extractDomain(4)

print_exception(4)

safeGet(4)

getHost(4)

createConcurrentMap(4)

pre(4)

writeAddress(3)

strToIbw(3)

getTemporaryDirectBuffer(3)

releaseTemporaryDirectBuffer(3)

random(3)

explode(3)

readAddress(3)

getProperty(3)

equiv(3)

parseStringList(3)

addBitsToTable(3)

join(3)

Méthodes fréquemment utilisées

debug (20)

info (13)

getMessage (12)

newInternal (11)

evalString (11)

getBgColor (10)

createRandomAddress (10)

notExpected (9)

discard (9)

message (8)

Méthodes fréquemment utilisées

notexpected (8)

bset (8)

errln (8)

getPackageName (7)

bitSetBetween (7)

objectToByteBuffer (7)

getAdmLabel (6)

stack (6)

getLabel (6)

close (6)

iswindows (5)

className (5)

swallow (5)

needToImplement (4)

objectFromByteBuffer (4)

replace (4)

sleep (4)

evalBoolean (4)

match (4)

toHEX1 (4)

Méthodes fréquemment utilisées

iswindows (5)

className (5)

swallow (5)

needToImplement (4)

objectFromByteBuffer (4)

replace (4)

sleep (4)

evalBoolean (4)

match (4)

toHEX1 (4)

toIter (4)

getImageIcon (4)

extractDomain (4)

print_exception (4)

safeGet (4)

getHost (4)

createConcurrentMap (4)

pre (4)

writeAddress (3)

strToIbw (3)

getTemporaryDirectBuffer (3)

releaseTemporaryDirectBuffer (3)

random (3)

explode (3)

readAddress (3)

getProperty (3)

equiv (3)

parseStringList (3)

addBitsToTable (3)

join (3)

Méthodes fréquemment utilisées

toIter (4)

getImageIcon (4)

extractDomain (4)

print_exception (4)

safeGet (4)

getHost (4)

createConcurrentMap (4)

pre (4)

writeAddress (3)

strToIbw (3)

getTemporaryDirectBuffer (3)

releaseTemporaryDirectBuffer (3)

random (3)

explode (3)

readAddress (3)

getProperty (3)

equiv (3)

parseStringList (3)

addBitsToTable (3)

join (3)

objectToBuffer (3)

makeTwo (3)

replaceText (3)

getViewArea (3)

determineMergeParticipants (3)

determineMergeCoords (3)

getLabelString (3)

unescapeXML (3)

getAppIF (3)

getVjIF (3)

isDeprecated (3)

runtimeException (2)

getCommunity (2)

getBundlePoolRepository (2)

methodNameToAttributeName (2)

referencesToLong (2)

readStreamable (2)

getIndex (2)

pickNext (2)

handleCapturedInterrupt (2)

Exemple #1

0

Afficher le fichier

Fichier : Util.java Projet : dav009/wikistatsextractor

public static String cleanSurfaceForms(String s) { int len = s.length(); StringBuilder output = new StringBuilder(); // the only difficulty is to deal with imbricated [[, like [[File: here // we [[Go]] ]], for instance int nb_brackets = 0; int start_bracket = 0; for (int i = 0; i < len - 1; i++) { char c = s.charAt(i), c_1 = s.charAt(i + 1); if (c == '[' && c_1 == '[') { if (nb_brackets == 0) start_bracket = i + 2; nb_brackets++; i++; } if (c == ']' && c_1 == ']') { nb_brackets--; if (nb_brackets == 0) { // time to look at what was in that link boolean inner_wiki_ref = false; int index_pipe = -1; for (int k = start_bracket; k < i; k++) { if (s.charAt(k) == ':') { inner_wiki_ref = true; break; } if (s.charAt(k) == '|') index_pipe = k + 1; } if (!inner_wiki_ref) { if (index_pipe != -1) { output.append(Util.unescapeXML(s.substring(index_pipe, i))); } else { output.append(Util.unescapeXML(s.substring(start_bracket, i))); } i += 1; continue; } } } if (nb_brackets == 0) { output.append(c); } } return output.toString(); }

Exemple #2

0

Afficher le fichier

Fichier : Util.java Projet : dav009/wikistatsextractor

/**
 * Returns the standardized URI of a page, following a redirect when the page declares one.
 *
 * <p>If {@code page} contains a {@code <redirect title=} marker, the text between that marker
 * and the next {@code />} is trimmed, stripped of double quotes, XML-unescaped and handed to
 * {@code uriStandardization}. Otherwise the page title is standardized instead.
 *
 * @param page raw markup of the page
 * @param language forwarded unchanged to {@code uriStandardization}
 * @return the value produced by {@code uriStandardization} for the redirect target or the title
 */
public static String getResolvedPageUri(String page, String language) {
  final String redirectMarker = "<redirect title=";

  // The title is looked up first, before the redirect check.
  String pageTitle = Util.getTitle(page);

  int markerAt = page.indexOf(redirectMarker);
  if (markerAt < 0) {
    return uriStandardization(pageTitle, null, language);
  }

  int closeAt = page.indexOf("/>", markerAt);
  String target =
      page.substring(markerAt + redirectMarker.length(), closeAt).trim().replace("\"", "");
  return uriStandardization(Util.unescapeXML(target), null, language);
}

Exemple #3

0

Afficher le fichier

Fichier : Util.java Projet : dav009/wikistatsextractor

/**
 * Extracts the text of a Wikipedia dump page as a list of strings, one per paragraph.
 *
 * <p>Only the span between the {@code <text xml:space="preserve">} marker and the first
 * {@code </text>} is scanned, character by character. During the scan the method:
 *
 * <ul>
 *   <li>skips everything nested inside double curly braces, and inside the curly-brace/pipe
 *       pairs used for tables (this loses most tables);
 *   <li>skips runs of two or more apostrophes (bold) or equals signs (titles);
 *   <li>suppresses the text between an escaped less-than sign and the next escaped greater-than
 *       sign, provided a '>' is found fewer than 120 characters ahead;
 *   <li>replaces each pattern of {@code to_unescape} with the matching entry of
 *       {@code unescaped};
 *   <li>closes the current paragraph at a newline that is followed by a newline or '=', or
 *       that is preceded by '='.
 * </ul>
 *
 * <p>NOTE(review): several string literals compared below (comment, div and ref delimiters)
 * cannot match as written, because the compared substring is longer than the literal. They
 * look like escaped forms that were decoded when this snippet was extracted. Verify against
 * the upstream project before relying on the comment, div or reference handling.
 *
 * @param page raw page markup
 * @param ignore_lists when true, lines starting with '*', '#' or ':' are dropped
 * @param ignore_ref when true, the reference-handling branch is enabled
 * @param clean_links when true, "[[Something|a mystery thing]]" becomes "a mystery thing" and
 *     links whose content contains a colon, such as "[[File:path/to/truc]]", are removed
 * @param filter_by_size when true, returns {@code null} if the text span is shorter than 100
 *     characters or if fewer than 100 characters were kept in total
 * @return the paragraphs, or {@code null} when {@code isWikiText(page)} is false, when the
 *     text marker is absent, or when the size filter rejects the page
 */
public static List<String> getCleanTextFromPage(
    String page,
    boolean ignore_lists,
    boolean ignore_ref,
    boolean clean_links,
    boolean filter_by_size) {
  /** if it is not a wikitext page, return null */
  if (!isWikiText(page)) {
    return null;
  }
  /** go to the index of the text */
  int index_text = page.indexOf("<text xml:space=\"preserve\">");
  if (index_text == -1) return null;
  index_text += "<text xml:space=\"preserve\">".length();
  /** locate the end */
  int end_text = page.indexOf("</text>");
  if (filter_by_size) {
    /** text is too short */
    if (end_text - index_text < 100) return null;
  }
  StringBuilder sb = new StringBuilder();
  ArrayList<String> output = new ArrayList<String>();
  int nb_accolades = 0;
  int nb_semi_accolades = 0; // detects the "{|"
  boolean is_in_ref = false;
  boolean is_in_markup = false;
  boolean is_in_comment = false;
  boolean is_in_div = false;
  int div_level = 0;
  boolean is_in_list = false;
  int nb_brackets = 0;
  int start_bracket = 0;
  // Sum of the StringBuilder lengths at each paragraph flush; used by the final size filter.
  int total_length = 0;
  for (int i = index_text - 1; i < end_text - 1; i++) {
    char c = page.charAt(i);
    char c_1 = page.charAt(i + 1);
    // For good measure, the text virtually starts with a '\n' (the
    // list pattern includes '\n').
    if (i == index_text - 1) {
      c = '\n';
    }
    /** first we remove everything between {{ }} */
    if (c == '{' && c_1 == '{') {
      nb_accolades++;
      i++;
      continue;
    }
    if (c == '}' && c_1 == '}') {
      nb_accolades--;
      i++;
      continue;
    }
    if (nb_accolades > 0) continue;
    /** Then we remove everything between {| |} */
    if (c == '{' && c_1 == '|') {
      nb_semi_accolades++;
      i++;
      continue;
    }
    if (c == '|' && c_1 == '}') {
      nb_semi_accolades--;
      i++;
      continue;
    }
    if (nb_semi_accolades > 0) continue;
    /**
     * if we specify it, we can ignore everything that is in a list (start with \n* or \n# or
     * \n:).
     */
    if (ignore_lists && c == '\n' && (c_1 == '*' || c_1 == '#' || c_1 == ':')) {
      // The increment and decrement of i cancel out; only the flag changes here.
      i++;
      is_in_list = true;
      i--;
      continue;
    }
    // Any other newline ends the list line.
    if (ignore_lists && c == '\n' && is_in_list) is_in_list = false;
    if (ignore_lists && is_in_list) continue;
    /**
     * if it is a new paragraph, so either \n\n or =\n or \n=, we put the content of the
     * StringBuilder in a new String
     */
    if (c == '\n' && (c_1 == '\n' || c_1 == '=' || page.charAt(i - 1) == '=')) {
      if (sb.length() > 1) {
        String text_paragraph = superTrim(sb.toString());
        if (text_paragraph.length() > 1) output.add(text_paragraph);
        total_length += sb.length();
      }
      /** reset the stringbuilder */
      sb.setLength(0); ;
    }
    /** deal with the comments */
    // NOTE(review): as written this branch can never run, because a 7-character substring
    // never equals the empty string, and nothing in this method sets is_in_comment to true.
    // The literal (and possibly a sibling branch that opened the comment) looks lost in
    // extraction; presumably it matched an escaped comment delimiter. Confirm upstream.
    if (c == '&' && c_1 == 'l' && end_text > i + 7 && page.substring(i, i + 7).equals("")) {
      is_in_comment = false;
      i += 5;
      continue;
    }
    /** Sometimes there are some html div (yeah...) in the dump, we try to remove them. */
    // NOTE(review): the substring is 7 characters long but the literal has 4, so this cannot
    // match as written; presumably the literal was the escaped form of the tag. Confirm.
    if (c == '&' && c_1 == 'l' && end_text > i + 7 && page.substring(i, i + 7).equals("<div")) {
      is_in_div = true;
      div_level++;
      i += 6;
      continue;
    }
    // NOTE(review): same mismatch here (12-character substring, 6-character literal); the
    // bound check also uses i + 6 while the substring reads up to i + 12. Confirm upstream.
    if (c == '&'
        && c_1 == 'l'
        && end_text > i + 6
        && page.substring(i, i + 12).equals("</div>")) {
      div_level = Math.max(0, div_level - 1);
      if (div_level == 0) is_in_div = false;
      i += 11;
      continue;
    }
    /** remove the ''' ''' (bold) and === === (title) */
    if (c == '\'' && c_1 == '\'') {
      // Skip the whole run, then step back one so the loop increment lands just after it.
      while (i < end_text && page.charAt(i) == '\'') i++;
      i--;
      continue;
    }
    if (c == '=' && c_1 == '=') {
      while (i < end_text && page.charAt(i) == '=') i++;
      i--;
      continue;
    }
    /** deals with the links */
    if (clean_links) {
      if (c == '[' && c_1 == '[') {
        nb_brackets++;
        // Only the outermost opening bracket marks the start of the link content.
        if (nb_brackets == 1) start_bracket = i + 2;
        i++;
        continue;
      }
      if (c == ']' && c_1 == ']') {
        nb_brackets--;
        if (nb_brackets == 0 && !is_in_ref && !is_in_comment) {
          // time to look at what was in that link
          boolean inner_wiki_ref = false;
          int index_pipe = -1;
          for (int k = start_bracket; k < i; k++) {
            if (page.charAt(k) == ':') {
              inner_wiki_ref = true;
              break;
            }
            if (page.charAt(k) == '|') index_pipe = k + 1;
          }
          // Links containing a colon are dropped; otherwise keep the text after the last
          // pipe, or the whole content when there is no pipe.
          if (!inner_wiki_ref) {
            if (index_pipe != -1) {
              sb.append(Util.unescapeXML(page.substring(index_pipe, i)));
            } else {
              sb.append(Util.unescapeXML(page.substring(start_bracket, i)));
            }
          }
        }
        i++;
        continue;
      }
      // Everything inside a link is skipped here; it is emitted when the link closes.
      if (nb_brackets > 0) continue;
    }
    if (ignore_ref) {
      /** deal with the references (<ref>) */
      // NOTE(review): the three literals in this branch are shorter than the substrings they
      // are compared with (7 vs 4, 4 vs 1, 12 vs 6 characters), so none can match as written;
      // presumably they were the escaped forms of the tags. Confirm upstream.
      if (c == '&' && c_1 == 'l' && end_text > i + 7 && page.substring(i, i + 7).equals("<ref")) {
        is_in_ref = true;
        i += 6;
        /** particular case of the <ref name="thing"/> */
        int j = i;
        while (j < end_text - 5 && !page.substring(j, j + 4).equals(">")) {
          j++;
        }
        if (page.charAt(j - 1) == '/') {
          is_in_ref = false;
          i = j + 3;
        }
        continue;
      }
      if (c == '&'
          && c_1 == 'l'
          && end_text > i + 12
          && page.substring(i, i + 12).equals("</ref>")) {
        is_in_ref = false;
        i += 11;
        continue;
      }
    }
    /**
     * remove other kinds of markup. A markup starts with an escaped less-than sign, and there
     * is a greater-than sign less than 120 characters away
     */
    if (c == '&'
        && !is_in_ref
        && !is_in_comment
        && end_text > i + 4
        && c_1 == 'l'
        && page.charAt(i + 2) == 't'
        && page.charAt(i + 3) == ';') {
      // Look if there is a > less than 120 characters away. If
      // not, we don't remove it.
      // NOTE(review): this searches for a literal '>' although the text is scanned in its
      // escaped form elsewhere; presumably the escaped form was intended. Confirm upstream.
      int next_gt = page.indexOf(">", i);
      if (next_gt != -1 && next_gt - 120 < i) {
        is_in_markup = true;
        i += 3;
        continue;
      }
    }
    // An escaped greater-than sign ends the markup.
    if (!is_in_ref
        && c == '&'
        && end_text > i + 4
        && c_1 == 'g'
        && page.charAt(i + 2) == 't'
        && page.charAt(i + 3) == ';') {
      is_in_markup = false;
      i += 3;
      continue;
    }
    /** And to finish, unescape remaining xml tags. See the to_unescape static variable */
    if (c == '&') {
      boolean we_made_a_replacement = false;
      for (int unescaped_index = 0; unescaped_index < to_unescape.length; unescaped_index++) {
        String pattern = to_unescape[unescaped_index];
        boolean match = true;
        // Skip patterns that would run past the end of the text.
        if (end_text <= i + pattern.length()) continue;
        for (int i_sub = 0; i_sub < pattern.length(); i_sub++) {
          if (page.charAt(i + i_sub) != pattern.charAt(i_sub)) {
            match = false;
            break;
          }
        }
        if (match) {
          we_made_a_replacement = true;
          // The pattern is always consumed, but its replacement is only emitted when the
          // current position is not being suppressed.
          if (!is_in_ref && !is_in_markup && !is_in_comment && !is_in_list && !is_in_div)
            sb.append(unescaped[unescaped_index]);
          i += pattern.length() - 1;
          break;
        }
      }
      if (we_made_a_replacement) continue;
    }
    // Ordinary character: keep it unless one of the suppression flags is set.
    if (!is_in_ref && !is_in_markup && !is_in_comment && !is_in_list && !is_in_div) {
      sb.append(c);
    }
  }
  total_length += sb.length();
  if (filter_by_size) {
    if (total_length < 100) return null;
  }
  // Flush whatever remains as the last paragraph.
  String text_paragraph = superTrim(sb.toString());
  if (text_paragraph.length() > 1) {
    output.add(text_paragraph);
  }
  return output;
}