Java Util.unescapeXMLの例

プログラミング言語: Java

名前空間/パッケージ名: java.util

クラス/型: Util

メソッド/関数: unescapeXML

hotexamples.comのコード掲載数: 3

Java Util.unescapeXML - 3件のコード例が見つかりました。すべてオープンソースプロジェクトから抽出されたJavaのjava.util.Util.unescapeXMLの実例で、最も評価が高いものを厳選しています。コード例の評価を行っていただくことで、より質の高いコード例が表示されるようになります。

よく使われるメソッド

表示 / 非表示

debug(20)

info(13)

getMessage(12)

newInternal(11)

evalString(11)

getBgColor(10)

createRandomAddress(10)

notExpected(9)

discard(9)

message(8)

notexpected(8)

bset(8)

errln(8)

getPackageName(7)

bitSetBetween(7)

objectToByteBuffer(7)

getAdmLabel(6)

stack(6)

getLabel(6)

close(6)

iswindows(5)

className(5)

swallow(5)

needToImplement(4)

objectFromByteBuffer(4)

replace(4)

sleep(4)

evalBoolean(4)

match(4)

toHEX1(4)

toIter(4)

getImageIcon(4)

extractDomain(4)

print_exception(4)

safeGet(4)

getHost(4)

createConcurrentMap(4)

pre(4)

writeAddress(3)

strToIbw(3)

getTemporaryDirectBuffer(3)

releaseTemporaryDirectBuffer(3)

random(3)

explode(3)

readAddress(3)

getProperty(3)

equiv(3)

parseStringList(3)

addBitsToTable(3)

join(3)

コード例 #1

ファイルを表示

ファイル: Util.java プロジェクト: dav009/wikistatsextractor

 /**
  * Replaces the wiki links found in {@code s} by their visible text and returns the result.
  *
  * <p>Text outside of links is copied unchanged. For each outermost {@code [[...]]} link:
  *
  * <ul>
  *   <li>if its body contains a {@code ':'} anywhere (e.g. {@code [[File:...]]}), the whole link,
  *       brackets included, is dropped;
  *   <li>otherwise the text after the last {@code '|'} (or the whole body when there is no pipe)
  *       is passed through {@code Util.unescapeXML} and appended.
  * </ul>
  *
  * <p>Links may be nested, e.g. {@code [[File: here we [[Go]] ]]}; only the outermost link is
  * evaluated. A {@code ]]} without a matching {@code [[} is copied literally, and everything
  * following an unterminated {@code [[} is dropped.
  *
  * @param s the wiki text to clean; must not be {@code null}
  * @return {@code s} with the link markup removed
  */
 public static String cleanSurfaceForms(String s) {
   int len = s.length();
   StringBuilder output = new StringBuilder(len);
   int depth = 0; // number of "[[" currently open
   int linkStart = 0; // index of the first character inside the outermost open link
   // Walk every character, including the last one (which has no successor to pair with).
   for (int i = 0; i < len; i++) {
     char c = s.charAt(i);
     // '\0' can never complete a "[[" or "]]" pair, so it is a safe stand-in past the end.
     char next = (i + 1 < len) ? s.charAt(i + 1) : '\0';

     if (c == '[' && next == '[') {
       if (depth == 0) {
         linkStart = i + 2;
       }
       depth++;
       i++; // consume the second '['
       continue;
     }

     // Only treat "]]" as a closer while a link is open, so a stray "]]" cannot drive the
     // depth negative and swallow the rest of the text.
     if (c == ']' && next == ']' && depth > 0) {
       depth--;
       if (depth == 0) {
         // Outermost link just closed: decide what, if anything, to keep from its body.
         boolean hasColon = false;
         int labelStart = linkStart;
         for (int k = linkStart; k < i; k++) {
           char inside = s.charAt(k);
           if (inside == ':') {
             hasColon = true;
             break;
           }
           if (inside == '|') {
             labelStart = k + 1;
           }
         }
         if (!hasColon) {
           output.append(Util.unescapeXML(s.substring(labelStart, i)));
         }
       }
       i++; // consume the second ']' whether or not the link was kept
       continue;
     }

     if (depth == 0) {
       output.append(c);
     }
   }
   return output.toString();
 }

コード例 #2

ファイルを表示

ファイル: Util.java プロジェクト: dav009/wikistatsextractor

  /**
   * Gets the correct uri of a page, following a redirect when the page declares one.
   *
   * <p>If {@code page} contains a {@code <redirect title=... />} element, the value found between
   * {@code <redirect title=} and the next {@code />} is trimmed, stripped of double quotes,
   * XML-unescaped and standardized. Otherwise, or when the redirect element is never closed by
   * {@code />}, the page's own title (as returned by {@code Util.getTitle}) is standardized
   * instead.
   *
   * @param page the page markup to inspect; must not be {@code null}
   * @param language passed unchanged to {@code uriStandardization}
   * @return the value returned by {@code uriStandardization} for the redirect target or the title
   */
  public static String getResolvedPageUri(String page, String language) {
    final String redirectPrefix = "<redirect title=";

    /** look for the title of the page; it is the fallback whenever no usable redirect exists */
    String title = Util.getTitle(page);

    int startRedirect = page.indexOf(redirectPrefix);
    if (startRedirect == -1) {
      return uriStandardization(title, null, language);
    }

    int valueStart = startRedirect + redirectPrefix.length();
    int endRedirect = page.indexOf("/>", valueStart);
    if (endRedirect == -1) {
      // Malformed redirect element (no "/>"): fall back to the title instead of letting
      // substring() throw StringIndexOutOfBoundsException.
      return uriStandardization(title, null, language);
    }

    String redirect = page.substring(valueStart, endRedirect).trim();
    // Literal replacement; no regular expression is needed to drop the quotes.
    redirect = redirect.replace("\"", "");
    redirect = Util.unescapeXML(redirect);
    return uriStandardization(redirect, null, language);
  }

コード例 #3

ファイルを表示

ファイル: Util.java プロジェクト: dav009/wikistatsextractor

  /**
   * Extracts the readable text of a wikipedia page and returns it as a list of Strings, one per
   * paragraph.
   *
   * <p>The text that is scanned lies between {@code <text xml:space="preserve">} and the first
   * {@code </text>} of {@code page}. The markup inside that region is expected to be XML-escaped
   * (the code looks for {@code &lt;} / {@code &gt;} rather than raw angle brackets).
   *
   * <p>The following is always removed: - the comments (&lt;!-- comment --&gt;) - the tables
   * {| |}, and also the templates {{ }}. This implies that we lose most of the tables, but the bet
   * is that enough text will remain. - the html divs (&lt;div ... &lt;/div&gt;) - runs of quotes
   * ('' and ''', used for italic/bold) and runs of = (titles) - the other markups (&lt; and &gt;),
   * when the closing &gt; is less than 120 characters after the opening &lt;.
   *
   * <p>Optionally you can remove: - the lists (ignore_lists), it will remove every line that starts
   * with *,# or : - the references (ignore_ref) (&lt;ref [...] /ref&gt;) - and clean the links
   * (clean_links). If so, then "[[Something|a mystery thing]]" will become "a mystery thing", and
   * "[[File:path/to/truc]]" will be removed (any link whose body contains a ':' is removed).
   *
   * <p>A new paragraph starts at "\n\n", at "\n=" and at "=\n". Paragraphs are passed through
   * {@code superTrim} and kept only when the result is longer than one character.
   *
   * <p>The remaining escaped characters are unescaped using the {@code to_unescape} and {@code
   * unescaped} static arrays.
   *
   * @param page the page to process
   * @param ignore_lists if true, list lines (starting with *, # or :) are skipped
   * @param ignore_ref if true, the content of &lt;ref&gt; elements is skipped
   * @param clean_links if true, [[...]] links are replaced by their visible text
   * @param filter_by_size if true, pages whose raw or cleaned text is shorter than 100 characters
   *     are rejected. The original note says this should be false for abstracts, since those pages
   *     are artificially shortened.
   * @return the list of paragraphs, or null if {@code isWikiText(page)} is false, if the opening
   *     text tag is missing, or if the page was rejected by {@code filter_by_size}
   */
  public static List<String> getCleanTextFromPage(
      String page,
      boolean ignore_lists,
      boolean ignore_ref,
      boolean clean_links,
      boolean filter_by_size) {
    /** if it is not a wikitext page, return null */
    if (!isWikiText(page)) {
      return null;
    }

    /** go to the index of the text (index_text ends up just after the opening tag) */
    int index_text = page.indexOf("<text xml:space=\"preserve\">");
    if (index_text == -1) return null;
    index_text += "<text xml:space=\"preserve\">".length();

    /**
     * locate the end. NOTE(review): the search starts at the beginning of the page, and a missing
     * closing tag (-1) is not handled explicitly; in that case the main loop simply does not run.
     */
    int end_text = page.indexOf("</text>");

    if (filter_by_size) {
      /** text is too short */
      if (end_text - index_text < 100) return null;
    }

    // sb accumulates the current paragraph, output the finished paragraphs
    StringBuilder sb = new StringBuilder();
    ArrayList<String> output = new ArrayList<String>();
    int nb_accolades = 0; // depth of the currently open "{{"
    int nb_semi_accolades = 0; // detects the "{|"
    // The is_in_* flags below only suppress the appending of characters to sb; the scanning of
    // the other kinds of markup goes on while they are set.
    boolean is_in_ref = false;
    boolean is_in_markup = false;
    boolean is_in_comment = false;
    boolean is_in_div = false;
    int div_level = 0; // depth of the currently open divs
    boolean is_in_list = false;
    int nb_brackets = 0; // depth of the currently open "[["
    int start_bracket = 0; // index of the first character inside the outermost open link
    int total_length = 0; // sum of sb.length() over the flushed paragraphs and the last one

    // c is the current character and c_1 the next one. The loop starts one character before
    // the text (on the '>' of the opening tag) so that it can be replaced by a virtual '\n'.
    // NOTE(review): the loop stops at end_text - 2, so the last character before "</text>" is
    // only ever seen as c_1 and is never appended. Confirm whether this is intended.
    for (int i = index_text - 1; i < end_text - 1; i++) {
      char c = page.charAt(i);
      char c_1 = page.charAt(i + 1);
      // for good measure, the text will virtually starts with a '\n' (the
      // list pattern includes '\n')
      if (i == index_text - 1) {
        c = '\n';
      }

      /** first we remove everything between {{ }} */
      if (c == '{' && c_1 == '{') {
        nb_accolades++;
        i++;
        continue;
      }
      if (c == '}' && c_1 == '}') {
        nb_accolades--;
        i++;
        continue;
      }
      // only a positive depth skips text; an unbalanced "}}" (negative depth) does not
      if (nb_accolades > 0) continue;

      /** Then we remove everything between {| |} */
      if (c == '{' && c_1 == '|') {
        nb_semi_accolades++;
        i++;
        continue;
      }
      if (c == '|' && c_1 == '}') {
        nb_semi_accolades--;

        i++;
        continue;
      }
      if (nb_semi_accolades > 0) continue;

      /**
       * if we specify it, we can ignore everything that is in a list (start with \n* or \n# or
       * \n:). NOTE(review): the i++ / i-- pair below cancels out, so the list marker itself is
       * visited by the next iteration (and skipped there because is_in_list is set).
       */
      if (ignore_lists && c == '\n' && (c_1 == '*' || c_1 == '#' || c_1 == ':')) {
        i++;
        is_in_list = true;
        i--;
        continue;
      }
      // a '\n' that is not followed by a list marker ends the list
      if (ignore_lists && c == '\n' && is_in_list) is_in_list = false;
      if (ignore_lists && is_in_list) continue;

      /**
       * if it is a new paragraph, so either \n\n or =\n or \n=, we put the content of the
       * StringBuilder in a new String. The previous character is read from the page itself, also
       * on the first iteration where c is the virtual '\n'.
       */
      if (c == '\n' && (c_1 == '\n' || c_1 == '=' || page.charAt(i - 1) == '=')) {
        if (sb.length() > 1) {
          String text_paragraph = superTrim(sb.toString());
          if (text_paragraph.length() > 1) output.add(text_paragraph);
          // the untrimmed length is counted, even when the trimmed paragraph was not kept
          total_length += sb.length();
        }
        /** reset the stringbuilder (the lone ';' below is an empty statement) */
        sb.setLength(0);
        ;
      }

      /** deal with the comments (&lt;!-- --&gt;) */
      if (c == '&'
          && c_1 == 'l'
          && end_text > i + 7
          && page.substring(i, i + 7).equals("&lt;!--")) {
        is_in_comment = true;
        // together with the i++ of the loop, this skips the 7 characters of the opening
        i += 6;
        continue;
      }
      if (c == '-' && c_1 == '-' && end_text > i + 6 && page.substring(i, i + 6).equals("--&gt;")) {
        is_in_comment = false;
        i += 5;
        continue;
      }

      /** Sometimes there are some html div (yeah...) in the dump, we try to remove them. */
      if (c == '&'
          && c_1 == 'l'
          && end_text > i + 7
          && page.substring(i, i + 7).equals("&lt;div")) {
        is_in_div = true;
        div_level++;
        i += 6;
        continue;
      }
      // NOTE(review): the guard checks i + 6 although 12 characters are read; the check for the
      // closing ref further down uses i + 12 for a pattern of the same length.
      if (c == '&'
          && c_1 == 'l'
          && end_text > i + 6
          && page.substring(i, i + 12).equals("&lt;/div&gt;")) {
        // the level never goes below 0, so a stray closing div is harmless
        div_level = Math.max(0, div_level - 1);
        if (div_level == 0) is_in_div = false;
        i += 11;
        continue;
      }

      /** remove the ''' ''' (bold) and === === (title): skip the whole run of quotes or of = */
      if (c == '\'' && c_1 == '\'') {
        while (i < end_text && page.charAt(i) == '\'') i++;
        // step back on the last character of the run, the i++ of the loop moves past it
        i--;
        continue;
      }
      if (c == '=' && c_1 == '=') {
        while (i < end_text && page.charAt(i) == '=') i++;
        i--;
        continue;
      }

      /** deals with the links */
      if (clean_links) {
        if (c == '[' && c_1 == '[') {
          nb_brackets++;
          // only the outermost link remembers where its body starts
          if (nb_brackets == 1) start_bracket = i + 2;
          i++;
          continue;
        }

        if (c == ']' && c_1 == ']') {
          nb_brackets--;
          if (nb_brackets == 0 && !is_in_ref && !is_in_comment) {
            // time to look at what was in that link: a ':' anywhere in the body means the link
            // is dropped, otherwise we keep what follows the last '|' (or the whole body)
            boolean inner_wiki_ref = false;
            int index_pipe = -1;
            for (int k = start_bracket; k < i; k++) {
              if (page.charAt(k) == ':') {
                inner_wiki_ref = true;
                break;
              }
              if (page.charAt(k) == '|') index_pipe = k + 1;
            }
            if (!inner_wiki_ref) {
              if (index_pipe != -1) {
                sb.append(Util.unescapeXML(page.substring(index_pipe, i)));
              } else {
                sb.append(Util.unescapeXML(page.substring(start_bracket, i)));
              }
            }
          }
          i++;
          continue;
        }
        // the characters inside a link are never appended one by one
        if (nb_brackets > 0) continue;
      }

      if (ignore_ref) {
        /** deal with the references (&lt;ref&gt;) */
        if (c == '&'
            && c_1 == 'l'
            && end_text > i + 7
            && page.substring(i, i + 7).equals("&lt;ref")) {
          is_in_ref = true;
          i += 6;
          /**
           * particular case of the <ref name="thing"/>: look for the next &gt; and check whether
           * it is preceded by a '/'. NOTE(review): if no &gt; is found, j stops at end_text - 5
           * and the character before that position is tested instead.
           */
          int j = i;
          while (j < end_text - 5 && !page.substring(j, j + 4).equals("&gt;")) {
            j++;
          }
          if (page.charAt(j - 1) == '/') {
            is_in_ref = false;
            // j + 3 is the ';' of "&gt;", the i++ of the loop moves past it
            i = j + 3;
          }
          continue;
        }

        if (c == '&'
            && c_1 == 'l'
            && end_text > i + 12
            && page.substring(i, i + 12).equals("&lt;/ref&gt;")) {
          is_in_ref = false;
          i += 11;
          continue;
        }
      }

      /**
       * remove other kinds of markup. A markup starts with &lt;, and there is a &gt; less than 120
       * characters away
       */
      if (c == '&'
          && !is_in_ref
          && !is_in_comment
          && end_text > i + 4
          && c_1 == 'l'
          && page.charAt(i + 2) == 't'
          && page.charAt(i + 3) == ';') {
        // look if there is a &gt; less than 120 characters away. If
        // not, we don't remove it.
        int next_gt = page.indexOf("&gt;", i);
        if (next_gt != -1 && next_gt - 120 < i) {
          is_in_markup = true;
          i += 3;
          continue;
        }
      }
      // any &gt; met outside of a reference closes the current markup
      if (!is_in_ref
          && c == '&'
          && end_text > i + 4
          && c_1 == 'g'
          && page.charAt(i + 2) == 't'
          && page.charAt(i + 3) == ';') {
        is_in_markup = false;
        i += 3;
        continue;
      }

      /**
       * And to finish, unescape remaining xml tags. See the to_unescape static variable. The first
       * pattern of to_unescape that matches at position i is replaced by the entry of unescaped
       * with the same index. (Both arrays are declared outside of this method; presumably they
       * are parallel arrays - verify.)
       */
      if (c == '&') {
        boolean we_made_a_replacement = false;
        for (int unescaped_index = 0; unescaped_index < to_unescape.length; unescaped_index++) {
          String pattern = to_unescape[unescaped_index];
          boolean match = true;
          // the pattern would not fit before the end of the text
          if (end_text <= i + pattern.length()) continue;
          for (int i_sub = 0; i_sub < pattern.length(); i_sub++) {
            if (page.charAt(i + i_sub) != pattern.charAt(i_sub)) {
              match = false;
              break;
            }
          }
          if (match) {
            we_made_a_replacement = true;
            // the pattern is consumed in any case, but only written when we are in plain text
            if (!is_in_ref && !is_in_markup && !is_in_comment && !is_in_list && !is_in_div)
              sb.append(unescaped[unescaped_index]);
            i += pattern.length() - 1;
            break;
          }
        }
        if (we_made_a_replacement) continue;
      }

      // plain character: keep it unless we are inside something that is being removed
      if (!is_in_ref && !is_in_markup && !is_in_comment && !is_in_list && !is_in_div) {
        sb.append(c);
      }
    }
    // account for the last paragraph, which has not been flushed yet
    total_length += sb.length();

    if (filter_by_size) {
      // the cleaned text is too short
      if (total_length < 100) return null;
    }

    String text_paragraph = superTrim(sb.toString());
    if (text_paragraph.length() > 1) {
      output.add(text_paragraph);
    }
    return output;
  }