예제 #1
0
  /**
   * Downloads the article behind {@code aLink} and renders it as the article section of the
   * generated HTML report.
   *
   * <p>The page title is taken from the first {@code <title>} element and the body from {@code
   * parser}; the body markup is passed through {@code Jsoup.clean} with {@code Whitelist.basic()}
   * before both are handed to {@code HtmlContentWriterUtil.generateArticleHtml}.
   *
   * @param aLink URL of the article; {@code null} or empty yields an empty string
   * @param parser extracts the main article element from the downloaded document
   * @return the generated HTML, or an empty string when the link is missing, the download fails,
   *     or the title or article body cannot be located
   */
  public static String getArticleContent(String aLink, ArticleBodyParser parser) {
    if (aLink == null || aLink.isEmpty()) {
      return "";
    }

    final Document doc;
    try {
      doc =
          Jsoup.connect(aLink)
              .header("Accept-Encoding", "gzip, deflate")
              .userAgent(userAgent)
              .timeout(6000)
              .followRedirects(true)
              .maxBodySize(0)
              .get();
    } catch (IOException e) {
      // Hand the exception to the logger so the cause of the failure is not lost.
      LOGGER.log(
          Level.WARNING, "Error connecting, while fetching the article with link " + aLink, e);
      return "";
    }

    Element title = doc.getElementsByTag("title").first();
    Element mainArticle = parser.parseArticleFromDoc(doc);
    if (title == null || mainArticle == null) {
      LOGGER.warning("We could not fetch the title and main body for link: " + aLink);
      return "";
    }

    return HtmlContentWriterUtil.generateArticleHtml(
        title.html(), aLink, Jsoup.clean(mainArticle.html(), Whitelist.basic()));
  }
예제 #2
0
 /**
  * Downloads a page and looks for an RSS feed advertised through a {@code <link rel="alternate"
  * type="application/rss+xml">} element.
  *
  * <p>Every {@code <link>} element is inspected in document order; the first one that is an RSS
  * alternate wins and its {@code href} is handed to the feed source manager.
  *
  * @param url The URL of the page to try and discover the feed for.
  * @return The feed source that was matched or created, or {@code null} when the page advertises
  *     no RSS feed.
  * @throws ClientProtocolException If the page could not be pulled.
  * @throws IOException If the page could not be pulled.
  * @throws DataOperationException If a query could not be executed.
  */
 public FeedSource discover(final String url)
     throws ClientProtocolException, IOException, DataOperationException {
   log.fine("Discovering feed for " + url);
   try (final CloseableHttpClient httpClient = HttpClientBuilder.create().build()) {
     final HttpGet request = new HttpGet(url);
     try (final CloseableHttpResponse httpResponse = httpClient.execute(request)) {
       final Document page = Jsoup.parse(EntityUtils.toString(httpResponse.getEntity()));
       for (final Element link : page.select("link")) {
         if (!"alternate".equals(link.attr("rel"))) {
           log.fine("Found link " + link.html());
           continue;
         }
         if (!"application/rss+xml".equals(link.attr("type"))) {
           log.fine("Found alternate link " + link.html());
           continue;
         }
         final String rss = link.attr("href");
         log.fine("Found rss link " + rss);
         return this.feedSourceManager.findOrCreateByFeedUrl(rss);
       }
     }
   }
   return null;
 }
예제 #3
0
 /** Setting an element's inner HTML replaces its existing children with the parsed markup. */
 @Test
 public void testSetHtml() {
   Element container = Jsoup.parse("<div id=1><p>Hello</p></div>").getElementById("1");

   container.html("<p>there</p><p>now</p>");

   String rendered = TextUtil.stripNewlines(container.html());
   assertEquals("<p>there</p><p>now</p>", rendered);
 }
예제 #4
0
  /**
   * A cloned element must carry its own copy of the class-name set: edits made to the clone's set
   * must not show up in the set obtained from the original element.
   */
  @Test
  public void testClonesClassnames() {
    Element original = Jsoup.parse("<div class='one two'></div>").select("div").first();
    Set<String> originalNames = original.classNames();
    assertEquals(2, originalNames.size());
    assertTrue(originalNames.contains("one"));
    assertTrue(originalNames.contains("two"));

    Element duplicate = original.clone();
    Set<String> duplicateNames = duplicate.classNames();
    assertEquals(2, duplicateNames.size());
    assertTrue(duplicateNames.contains("one"));
    assertTrue(duplicateNames.contains("two"));

    // Mutate only the clone's set.
    duplicateNames.add("three");
    duplicateNames.remove("one");

    // The original's set is unaffected; the clone's set reflects the edits.
    assertTrue(originalNames.contains("one"));
    assertFalse(originalNames.contains("three"));
    assertFalse(duplicateNames.contains("one"));
    assertTrue(duplicateNames.contains("three"));

    assertEquals("", original.html());
    assertEquals("", duplicate.html());
  }
예제 #5
0
  /**
   * Converts one HTML table row into a bank account, when the row describes one.
   *
   * <p>Only rows whose {@code class} attribute is {@code bg0} (case-insensitive) in a "Current
   * Accounts" table are converted. Detail rows, card rows and rows of any other table type are
   * ignored.
   *
   * @param type heading of the table the row belongs to, e.g. "Current Accounts" or "Cards"
   * @param row the table row element
   * @return the account read from the row's cells, or {@code null} when the row is ignored
   */
  private RawBankAccount obtainBankAccountFromHtmlTableRow(String type, Element row) {
    String rowClass = row.attr("class");
    if (!"bg0".equalsIgnoreCase(rowClass)) {
      // covers "detail" rows as well as any other non-account row
      return null;
    }

    Log.v(TAG, "working row(" + type + "): " + row.html());

    if (!"Current Accounts".equalsIgnoreCase(type)) {
      // cards are skipped for now; every other type is unknown
      return null;
    }

    // Cell 2 supplies both the server id and the IBAN.
    return new RawBankAccount()
        .setServerId(row.child(2).text())
        .setName(row.child(0).child(0).text())
        .setIBAN(row.child(2).text())
        .setCurrency(row.child(1).text())
        .setBalance(Convert.strToFloat(row.child(3).text()))
        .setAvailableBalance(Convert.strToFloat(row.child(4).text()));
  }
예제 #6
0
  /**
   * Fetches a Google-relative URL and returns the inner HTML of the first paragraph on the page.
   *
   * <p>{@code goodGoogleSources} is a semicolon-separated list of markers. For each marker that
   * occurs in {@code url}, the page {@code https://www.google.com<url>} is downloaded; the first
   * download that contains a {@code <p>} element determines the result.
   *
   * @param url path (and query) relative to {@code https://www.google.com}
   * @return inner HTML of the first {@code <p>} element, or an empty string when no marker matches
   *     or no fetched page contains a paragraph
   * @throws IOException if a page cannot be downloaded
   * @throws InterruptedException declared for callers; not thrown by the code in this method
   */
  public String getDomainName(String url) throws IOException, InterruptedException {
    for (String source : goodGoogleSources.split(";")) {
      if (!url.contains(source)) {
        continue;
      }

      String request = String.format("https://www.google.com%s", url);
      System.out.println("request=" + request);
      Document doc =
          Jsoup.connect(request)
              .userAgent(
                  "Mozilla/5.0 (compatible; Googlebot/2.1; +http://www.google.com/bot.html)")
              .timeout(5000)
              .get();

      // Only the first paragraph is of interest; keep trying other markers if there is none.
      Elements paragraphs = doc.select("p");
      if (!paragraphs.isEmpty()) {
        return paragraphs.first().html();
      }
    }
    return "";
  }
 /**
  * Rebuilds {@code list} as a sequence of {@code <li><a>} entries for the given schemas.
  *
  * <p>The first eleven schemas become plain links to {@code /schema/<id>}. The entry at index 11
  * is rendered as a "MORE..." link with class {@code moreSchemas}, and every later schema is
  * rendered as a link carrying the classes {@code hiddenSchema hidden}.
  *
  * @param list element whose content is cleared and replaced
  * @param skeemas schemas to render, in display order
  */
 public static void schemaList(Element list, List<Skeema> skeemas) {
   // Position at which the "MORE..." toggle is emitted instead of a schema link.
   final int moreLinkIndex = 11;

   list.html("");
   int index = 0;
   for (Skeema s : skeemas) {
     Element anchor = list.appendElement("li").appendElement("a");
     if (index < moreLinkIndex) {
       anchor.attr("href", "/schema/" + s.getId().toString()).text(s.skeemaID);
     } else if (index == moreLinkIndex) {
       // NOTE(review): the schema at this index gets no link of its own - confirm that is intended.
       anchor.attr("href", "#").addClass("moreSchemas").text("MORE...");
     } else {
       anchor
           .addClass("hiddenSchema hidden")
           .attr("href", "/schema/" + s.getId().toString())
           .text(s.skeemaID);
     }
     index++;
   }
 }
예제 #8
0
 /**
  * Reads an EPUB, rewrites the HTML of every content resource and writes the result to a new
  * file.
  *
  * <p>For each resource a UTF-8 content-type {@code <meta>} element is inserted, the body is
  * passed through {@code alterElement} and {@code modify}, and the serialised document replaces
  * the resource data. The running {@code Count} is carried from one resource to the next.
  *
  * @param bookPath path of the EPUB file to read
  * @param dest path of the EPUB file to write
  * @throws FileNotFoundException if {@code bookPath} cannot be opened or {@code dest} cannot be
  *     created
  * @throws IOException if reading or writing the book fails
  */
 public static void processEpub(String bookPath, String dest)
     throws FileNotFoundException, IOException {
   final Book b;
   // try-with-resources: the input stream was previously never closed.
   try (FileInputStream in = new FileInputStream(new File(bookPath))) {
     b = new EpubReader().readEpub(in);
   }

   Count cnt = new Count(0, 0);
   for (Resource res : b.getContents()) {
     // Decode and encode explicitly as UTF-8 to match the charset declared in the meta element
     // below, instead of depending on the platform default charset.
     String content = new String(res.getData(), java.nio.charset.StandardCharsets.UTF_8);
     // NOTE(review): the second argument of Jsoup.parse(String, String) is the base URI, not a
     // charset - confirm "UTF-8" is really what is wanted here.
     Document doc = Jsoup.parse(content, "UTF-8");

     // <meta http-equiv="content-type" content="text/html; charset=utf-8">
     Element elem = new Element(Tag.valueOf("meta"), "");
     elem.attr("http-equiv", "content-type");
     elem.attr("content", "text/html; charset=utf-8");
     // NOTE(review): after() inserts the meta element as a sibling following <head>, not inside
     // it - verify this placement is intended.
     doc.head().after(elem);
     System.out.println(doc.head().data());

     Element ele = doc.body();
     alterElement(ele);
     Count cTemp = modify(ele, cnt);
     cnt.setCount(cTemp.getCount());
     cnt.setPgCount(cTemp.getPgCount());
     doc.body().html(ele.html());

     res.setData(doc.html().getBytes(java.nio.charset.StandardCharsets.UTF_8));
     if (res.getMediaType() == null) {
       res.setMediaType(new MediaType("html", "html"));
     }
   }

   // try-with-resources: guarantees the output file is flushed and closed even on failure.
   try (FileOutputStream out = new FileOutputStream(new File(dest))) {
     new EpubWriter().write(b, out);
   }
 }
예제 #9
0
 /** Appended text lands after the existing children and is HTML-escaped when serialised. */
 @Test
 public void testAddNewText() {
   Element container = Jsoup.parse("<div id=1><p>Hello</p></div>").getElementById("1");

   container.appendText(" there & now >");

   String rendered = TextUtil.stripNewlines(container.html());
   assertEquals("<p>Hello</p> there &amp; now &gt;", rendered);
 }
예제 #10
0
  /**
   * Parses an article page, stores the extracted article through {@code dbRobot} and increments
   * {@code cur_count}.
   *
   * <p>Title, creation time and content are read from the elements with ids {@code
   * activity-name}, {@code post-date} and {@code essay-body}. The intro is the text of the first
   * {@code .text p} element (or a fixed placeholder when there is none), cut to at most 50
   * characters and suffixed with "...". The picture is taken from the first {@code #media img}.
   *
   * @param html markup of the article page
   * @return always {@code null}; the result is delivered through {@code dbRobot}
   */
  @Override
  public Object parseHtml2Obj(String html) {
    Document page = Jsoup.parse(html);
    Element titleNode = page.getElementById("activity-name");
    Element dateNode = page.getElementById("post-date");
    Element bodyNode = page.getElementById("essay-body");
    Elements mediaImages = page.select("#media img");
    Elements introParagraphs = page.select(".text p");

    String intro = introParagraphs.isEmpty() ? "阅读全部" : introParagraphs.first().text();

    ArticleObj article = new ArticleObj();
    article.setFrom(account_desc);
    article.setContent(bodyNode.html());
    article.setCreatetime(dateNode.text());
    article.setTitle(titleNode.text());
    article.setIntro(intro.substring(0, Math.min(intro.length(), 50)) + "...");
    if (!mediaImages.isEmpty()) {
      article.setPic(getSrc(mediaImages.get(0).attr("src")));
    }

    System.err.println(article.getPic());
    dbRobot.AddArticleData(article);
    cur_count++;
    return null;
  }
 /**
  * Reads the route variations from the {@code div.route_variations} blocks of {@code page} and
  * registers them on the matching direction.
  *
  * <p>The first block is applied to {@code Direction.IN}; every following block is applied to
  * {@code Direction.OUT}. Each paragraph is flattened to plain text with one variation per line,
  * formatted as {@code <initial> <description>}. Variations whose initial equals {@code code} are
  * not registered.
  */
 private void makeVariations() {
   Elements variations = page.select("div.route_variations");
   direction = Direction.IN;
   for (Element directionVariation : variations) {
     direction.clearVariations();
     for (Element variation : directionVariation.select("p")) {
       // Turn the paragraph markup into plain text: <br> becomes a line break, all other tags are
       // dropped and runs of blanks are collapsed.
       String variationDescription =
           variation
               .html()
               .replaceAll("&nbsp;", " ")
               .replaceAll("<(br|BR)>", "\n")
               .replaceAll("</?[a-zA-Z]+.*?>", "")
               .replaceAll("([ \t\\xA0])+", " ");
       if (variationDescription.length() <= 3) {
         continue;
       }
       for (String line : variationDescription.split("\n+")) {
         if (line.length() <= 3) {
           continue;
         }
         String[] parts = line.trim().split(" ", 2);
         if (parts.length < 2) {
           // A line with an initial but no description used to fail on parts[1]; skip it.
           continue;
         }
         String variationInitial = parts[0];
         String description = parts[1];
         if (!code.equals(variationInitial)) {
           direction.addVariation(variationInitial, description);
         }
       }
     }
     direction = Direction.OUT;
   }
 }
예제 #12
0
  /**
   * Rewrites the images of an article so that they load directly and adds the cover image.
   *
   * <p>For every {@code <img>} that carries a {@code data-src} URL, the last path segment of
   * that URL is replaced by {@code 640} and stored as {@code src}; the {@code data-s}, {@code
   * data-src} and {@code data-w} attributes are removed and {@code max-width} is set to 640.
   * Images without a {@code data-src} value are left untouched so that an existing {@code src}
   * is not overwritten. In addition, for each {@code <script>} containing a {@code var cover =
   * "..."} assignment, an {@code <img>} pointing at that URL is inserted before the first {@code
   * <div>}.
   *
   * @param pcont HTML of the article; may be {@code null}
   * @return the rewritten document HTML, or an empty string when {@code pcont} is {@code null}
   */
  public String reviseImgForWX(String pcont) {
    if (pcont == null) {
      return "";
    }

    Document doc = Jsoup.parse(pcont);
    for (Element img : doc.select("img")) {
      String source = img.attr("data-src");
      if (source.isEmpty()) {
        // No lazy-load URL to derive from: rewriting would set src to the bare string "640".
        continue;
      }
      int pos = source.lastIndexOf("/") + 1;
      source = source.substring(0, pos);
      img.removeAttr("data-s");
      img.removeAttr("data-src");
      img.removeAttr("data-w");
      img.attr("src", source + "640");
      img.attr("max-width", "640");
    }

    Elements scripts = doc.select("script");
    Elements divs = doc.select("div");
    if (!scripts.isEmpty() && !divs.isEmpty()) {
      // Compiled once for all scripts rather than once per script.
      Pattern coverPattern = Pattern.compile("(?<=(var\\scover\\s=\\s\"))\\S+(?=\")");
      Element firstDiv = divs.get(0);
      for (Element script : scripts) {
        Matcher m = coverPattern.matcher(script.html());
        if (m.find()) {
          String nimg = "<img src=\"" + m.group() + "\"/>";
          firstDiv.before(nimg);
        }
      }
    }
    return doc.html();
  }
예제 #13
0
  /**
   * Copies the topic id and title from the title link of a topic item into the builder.
   *
   * <p>The link is the first match of {@code .item_title > a} below {@code ele}; the id is
   * derived from its {@code href} and the title is its inner HTML.
   *
   * @param topicBuilder builder that receives the id and title
   * @param ele element containing the topic's title link
   */
  private static void parseTitle(Topic.Builder topicBuilder, Element ele) {
    Element titleLink = ele.select(".item_title > a").get(0);
    Preconditions.checkState(titleLink.tagName().equals("a"));

    String href = titleLink.attr("href");
    topicBuilder.setId(Topic.getIdFromUrl(href));
    topicBuilder.setTitle(titleLink.html());
  }
예제 #14
0
 /**
  * Downloads a search result page (5 second timeout) and prints the inner HTML of every element
  * matching {@code .p-name em}. Needs network access.
  */
 @Test
 public void test01() throws Exception {
   String searchUrl = "http://search.jd.com/Search?keyword=OReilly&enc=utf-8&book=y&wq=OReilly";
   Document page = Jsoup.parse(new URL(searchUrl), 5000);
   for (Element nameFragment : page.select(".p-name em")) {
     System.out.println(nameFragment.html());
   }
 }
예제 #15
0
  /**
   * With pretty-printing switched off, the original whitespace and line breaks must be preserved
   * both in the whole document and in an element's inner HTML.
   */
  @Test
  public void testNotPretty() {
    Document doc = Jsoup.parse("<div>   \n<p>Hello\n there\n</p></div>");
    doc.outputSettings().prettyPrint(false);

    String expectedDocument =
        "<html><head></head><body><div>   \n<p>Hello\n there\n</p></div></body></html>";
    assertEquals(expectedDocument, doc.html());

    String expectedInner = "   \n<p>Hello\n there\n</p>";
    assertEquals(expectedInner, doc.select("div").first().html());
  }
예제 #16
0
  /**
   * Parses the list of posts (replies) from a thread page.
   *
   * <p>Every element with class {@code plc} is treated as one post. Avatar URL, author info and
   * message content are required; a post missing any of them is skipped. The reply URL and the
   * image list are optional. Relative URLs are resolved against {@code Constants.BASE_URL}.
   *
   * @param content HTML of the thread page
   * @return the posts that could be parsed, in document order; never {@code null}
   */
  public static List<Post> parsePostList(String content) {
    long s = System.currentTimeMillis();

    List<Post> posts = new ArrayList<Post>();
    Document document = Jsoup.parse(content);
    document.setBaseUri(Constants.BASE_URL);
    Elements elements = document.getElementsByClass("plc");
    for (Element plc : elements) {
      try {
        Post post = new Post();
        // avatar
        Element avatar = plc.getElementsByClass("avatar").first();
        post.setAvatarUrl(avatar.child(0).absUrl("src"));

        String authi = plc.getElementsByClass("authi").first().html();
        Element message = plc.getElementsByClass("message").first();
        post.setContent(message.html().trim());

        // The opening post has no reply button, so the reply URL is optional. Check for it
        // explicitly instead of relying on a swallowed exception.
        Element replyBtn = plc.getElementsByClass("replybtn").first();
        if (replyBtn != null && !replyBtn.children().isEmpty()) {
          post.setReplyUrl(replyBtn.child(0).absUrl("href"));
        }

        // Image attachments: prefer the multi-image list, fall back to the single-image block.
        Elements images = plc.getElementsByClass("img_list");
        if (images.isEmpty()) {
          images = plc.getElementsByClass("img_one");
        }
        if (!images.isEmpty()) {
          post.setImgList(images.first().html());
        }
        post.setAuthi(authi);

        posts.add(post);
      } catch (Exception e) {
        // Best effort: a block without the required parts is skipped, but leave a trace of it.
        LogMessage.i("parsePostList", "Skipping post that could not be parsed: " + e);
      }
      LogMessage.i("parsePostList", "解析时间:" + (System.currentTimeMillis() - s));
    }
    return posts;
  }
예제 #17
0
  /**
   * Extracts the movie title from an element: the text that sits directly in front of the first
   * {@code </a>} in the element's inner HTML, i.e. everything after the closest preceding {@code
   * >}.
   *
   * @param elem element whose inner HTML contains the title link
   * @return the title, or an empty string when the inner HTML contains no {@code </a>}
   */
  private static String getTitle(Element elem) {
    String html = elem.html();
    int end = html.indexOf("</a>");
    if (end < 0) {
      // No closing anchor tag: the character-by-character scan used to run off the string here.
      return "";
    }
    // lastIndexOf yields -1 when no '>' precedes the tag, so the substring then starts at 0.
    int tagClose = html.lastIndexOf('>', end - 1);
    return html.substring(tagClose + 1, end);
  }
예제 #18
0
 /**
  * Walks the element tree below {@code parentElement} and records elements in {@code map} or
  * {@code topMenumap}, keyed by {@code "<level>,<hashCode>"}.
  *
  * <p>The element's inner HTML is first replaced by the result of {@code deleteComent}
  * (presumably stripping HTML comments - verify against that helper). Then:
  *
  * <ul>
  *   <li>An element with children that is a {@code ul} or {@code table} is handled as a unit: if
  *       its HTML contains "首页" (home page) or its id contains "nav", its links are stored in
  *       {@code topMenumap}; otherwise the element itself is stored in {@code map}.
  *   <li>Any other element with children is descended into recursively.
  *   <li>A childless element other than {@code script} is stored in {@code map} when its HTML is
  *       not empty.
  * </ul>
  *
  * @param parentElement element to process; its inner HTML is rewritten in place
  * @param level depth of the caller; entries are recorded at {@code level + 1}
  */
 private void getChildElement(Element parentElement, Integer level) {
   parentElement.html(deleteComent(parentElement.html()));
   // System.out.println("key:"+(level+","+parentElement.hashCode())+",value:"+parentElement.html());
   if (parentElement.children().size() > 0) {
     level += 1;
     // NOTE(review): for ul/table the branch below does not depend on i, so the same work is
     // repeated once per child - confirm whether that is intended.
     for (int i = 0; i < parentElement.children().size(); i++) {
       if (("ul".equals(parentElement.tagName().toLowerCase()))
           || ("table".equals(parentElement.tagName().toLowerCase()))) { // treated as one unit
         String html = parentElement.html().replaceAll(" ", "").replaceAll(" ", ""); // strip Chinese and English spaces
         if (html.contains("首页") || parentElement.id().contains("nav")) {
           //
           // System.out.println("----------------------home page Start-----------------------------");
           Elements links = parentElement.select("a");
           for (Element ele : links) {
             // Store each navigation link once per (level, element hash) key.
             if (topMenumap.get(level + "," + ele.hashCode()) == null) {
               topMenumap.put(level + "," + ele.hashCode(), ele);
               // System.out.println(level + "," + ele.hashCode() + ",---------------" +
               // ele.html());
               // System.out.println("a:" +
               // ele.attr("abs:href") + ",text:" + ele.text());
             }
           }
           // System.out.println("----------------------home page End-----------------------------");
         } else {
           map.put(level + "," + parentElement.hashCode(), parentElement);
         }
       } else {
         getChildElement(parentElement.child(i), level);
       }
     }
   } else {
     // Leaf element: script elements are never recorded.
     if ("script".equals(parentElement.tagName().toLowerCase())) {
       return;
     }
     if (StringUtils.isNotEmpty(parentElement.html())) {
       level += 1;
       map.put(level + "," + parentElement.hashCode(), parentElement);
     }
   }
 }
예제 #19
0
  /**
   * Prepended HTML must be placed in front of the existing children, and the sibling indices of
   * all children must be renumbered afterwards.
   */
  @Test
  public void testPrependNewHtml() {
    Document doc = Jsoup.parse("<div id=1><p>Hello</p></div>");
    Element container = doc.getElementById("1");

    container.prepend("<p>there</p><p>now</p>");

    String rendered = TextUtil.stripNewlines(container.html());
    assertEquals("<p>there</p><p>now</p><p>Hello</p>", rendered);

    // check sibling index (reindexChildren):
    int expectedIndex = 0;
    for (Element paragraph : doc.select("p")) {
      assertEquals(expectedIndex, paragraph.siblingIndex);
      expectedIndex++;
    }
  }
예제 #20
0
  /**
   * Returns the trimmed content of the {@code small} element of the given column and records the
   * column in {@code currentColumn}.
   *
   * @param column index of the column to read
   * @param stripHtml {@code true} to return the element's plain text, {@code false} to return
   *     its inner HTML
   * @return the trimmed content, or an empty string when the element cannot be read
   */
  protected String getColumnContent(int column, boolean stripHtml) {
    try {
      currentColumn = column;
      Element element = getElement(column, "small");
      return stripHtml ? element.text().trim() : element.html().trim();
    } catch (Exception e) {
      // Pass the throwable itself: e.getMessage() may be null (e.g. for a NullPointerException),
      // and the logger then also records the stack trace that printStackTrace() used to print.
      Log.d(getClass().getName(), "Could not read content of column " + column, e);
    }

    return "";
  }
  /**
   * Parses the bundled page {@code html/emoji-code-<count>.html} and writes its category name and
   * emoji entries to {@code jsonWriter}.
   *
   * <p>The category name comes from the first element with class {@code category}; the emoji
   * come from the rows of the first {@code table-bordered} table. The method ends the enclosing
   * JSON object itself, so it presumably expects the caller to have begun that object - verify
   * against the caller.
   *
   * @param count number of the asset page to parse
   * @param jsonWriter writer receiving the category name and the emoji array
   * @throws IOException if the asset cannot be read or the JSON cannot be written
   */
  private void parseEmoji(int count, JsonWriter jsonWriter) throws IOException {
    // try-with-resources closes the asset stream even when parsing or writing fails.
    try (InputStream inputStream =
        getResources().getAssets().open("html/emoji-code-" + count + ".html")) {
      // Build the document from the input stream.
      Document document =
          Jsoup.parse(inputStream, "utf-8", "http://apps.timwhitlock.info/emoji/tables/unicode");

      // Start of one category.
      Element h3 = document.getElementsByClass("category").get(0);
      jsonWriter.name(KEY_EMOJI_CATEGORY).value(h3.html());
      jsonWriter.name(KEY_EMOJI_ARRAY);
      jsonWriter.beginArray();

      Elements trs =
          document.getElementsByClass("table-bordered").get(0).child(0).getElementsByTag("tr");

      for (int i = 0; i < trs.size(); i++) {
        // Start of one emoji object.
        jsonWriter.beginObject();

        // Cells 7, 8 and 9 of the row supply the code point, the UTF-8 bytes and the name.
        String codeUnicode = trs.get(i).child(7).child(0).html();
        String codeUtf8 = trs.get(i).child(8).html();
        String name = trs.get(i).child(9).html();

        jsonWriter
            .name(KEY_EMOJI_ICON_URL_ANDROID)
            .value(
                "http://apps.timwhitlock.info/static/images/emoji/emoji-android/"
                    + codeUnicode
                    + ".png");
        jsonWriter
            .name(KEY_EMOJI_ICON_URL_APPLE)
            .value(
                "http://apps.timwhitlock.info/static/images/emoji/emoji-apple/"
                    + codeUnicode
                    + ".png");
        jsonWriter.name(KEY_EMOJI_DESCRIPTION).value(name);
        jsonWriter.name(KEY_EMOJI_UNICODE).value(codeUnicode);
        jsonWriter.name(KEY_EMOJI_UTF8).value(codeUtf8);

        // End of the emoji object.
        jsonWriter.endObject();
      }
      // End of the category's emoji array.
      jsonWriter.endArray();
      // End of the category object.
      jsonWriter.endObject();
    }
  }
예제 #22
0
 /**
  * Checks that some element matching {@code selector} displays the given multi-line text.
  *
  * <p>Both sides are normalised before comparison: the expected text is split on line breaks,
  * each line is trimmed and the lines are concatenated; the element's inner HTML is split on
  * {@code <br>} tags, each piece is trimmed, its line breaks are removed and the pieces are
  * concatenated. A violation is recorded when no element matches.
  *
  * @param selector CSS selector of the candidate elements
  * @param text text expected to be displayed
  * @return the wrapped result: {@code true} when a matching element was found
  */
 public boolean contentLongText(String selector, String text) {
   StringBuilder expected = new StringBuilder();
   for (String line : text.split("\n")) {
     expected.append(line.trim());
   }
   String shrinkText = expected.toString();

   for (Element candidate : document().select(selector)) {
     StringBuilder rendered = new StringBuilder();
     for (String segment : candidate.html().trim().split("<(br|BR|Br|bR) */?>")) {
       for (String line : segment.trim().split("\n")) {
         rendered.append(line);
       }
     }
     if (rendered.toString().equals(shrinkText)) {
       return wrap(true);
     }
   }

   addViolation(String.format("入力されたはずのテキストがDOM要素 '%s' に表示されていません", selector));
   return wrap(false);
 }
예제 #23
0
 /**
  * Selects the elements this check works on.
  *
  * <p>Elements matching {@code FLASH_CONTENT_CSS_LIKE_QUERY} go into {@code decidableElements}.
  * Elements matching {@code SCRIPT_ELEMENT} go into {@code notDecidableElements}, from which
  * every script whose content does not contain {@code SWF_EXT} is then removed.
  *
  * @param sspHandler handler the selectors run against
  */
 @Override
 protected void select(SSPHandler sspHandler) {
   ElementSelector flashSelector = new SimpleElementSelector(FLASH_CONTENT_CSS_LIKE_QUERY);
   flashSelector.selectElements(sspHandler, decidableElements);

   ElementSelector scriptSelector = new SimpleElementSelector(SCRIPT_ELEMENT);
   scriptSelector.selectElements(sspHandler, notDecidableElements);

   // Keep only the scripts that mention a SWF resource.
   for (Iterator<Element> it = notDecidableElements.get().iterator(); it.hasNext(); ) {
     Element candidate = it.next();
     boolean mentionsSwf = StringUtils.contains(candidate.html(), SWF_EXT);
     if (!mentionsSwf) {
       it.remove();
     }
   }
 }
  /**
   * Downloads the post page at {@code url} and fills in {@code title}, {@code excerpt}, {@code
   * content} and {@code date}.
   *
   * <p>The post is the single {@code content} element inside {@code #main}; when there is not
   * exactly one, a message is printed and nothing is extracted. Lightbox elements are replaced
   * by their image nodes. The post's children are then read up to the first element with class
   * {@code clear}: the first {@code h1} supplies the title, the first {@code p} the excerpt, and
   * every child other than the title is appended to the content.
   *
   * @param aInConnection connection used to fetch the page
   * @param images collection that receives the images found in the post
   * @throws IOException if the page cannot be fetched
   */
  public void download(Connection aInConnection, Collection<Image> images) throws IOException {
    aInConnection.url(url);
    Document page = aInConnection.get();
    Elements contentBlocks = page.getElementById("main").getElementsByClass("content");

    if (contentBlocks.size() != 1) {
      System.out.println("More than one content in main section of post page " + toString());
      return;
    }

    Element post = contentBlocks.first();
    collectImages(post, images);

    // Replace every lightbox wrapper with the image nodes it contains.
    for (Element lightbox : post.getElementsByClass("lightbox")) {
      Collection<Node> imageNodes = extractImageNodes(lightbox);
      lightbox.parent().insertChildren(lightbox.siblingIndex(), imageNodes);
      lightbox.remove();
    }

    StringBuilder body = new StringBuilder();
    for (Element child : post.children()) {
      if (child.hasClass("clear")) {
        // no more post content
        break;
      }

      boolean isTitleHeading = title == null && child.tagName().equals("h1");
      if (isTitleHeading) {
        // the first h1 header is the title and is kept out of the content
        title = child.html();
        continue;
      }

      if (excerpt == null && child.tagName().equals("p")) {
        excerpt = child.text();
      }
      body.append(child.toString());
    }
    content = body.toString();

    String hunDate = post.getElementsByClass("date").first().html();
    date = new PostDate(hunDate);
  }
  /**
   * Highlights an API element in a piece of HTML (variant for math, after merging with recodoc).
   *
   * <p>Three passes are made over the parsed document:
   *
   * <ol>
   *   <li>In every {@code <clt api="apiElement">} tag whose text has the form {@code a.b}, the
   *       part before the dot is wrapped in a highlighting {@code <SPAN>}; any other such tag is
   *       wrapped as a whole.
   *   <li>In every {@code <pre>} block, each occurrence of {@code apiElement} that is surrounded
   *       by non-word characters is wrapped in the highlighting {@code <SPAN>}.
   *   <li>All {@code <clt>} tags are unwrapped so that only their content remains.
   * </ol>
   *
   * @param text HTML to process
   * @param color CSS colour used as the highlight background
   * @param apiElement name of the API element to highlight; matched literally
   * @return the HTML of the processed document
   */
  public static String getHighlightedText_math(String text, String color, String apiElement) {
    String highlightBeginning = "<SPAN style=\"BACKGROUND-COLOR: " + color + "\">";
    String highlightEnding = "</SPAN>";

    Document doc = Jsoup.parse(text);
    Elements apiElements = doc.select("clt[api=" + apiElement + "]");

    for (Element apielement : apiElements) {
      String[] apis = apielement.text().split("\\.");
      if (apis.length == 2) {
        apielement.html(highlightBeginning + apis[0] + highlightEnding + "." + apis[1]);
      } else {
        apielement.wrap(highlightBeginning);
      }
    }

    // highlight code snippet
    // Quote the name on both sides: characters such as '.' or '$' in it must be matched and
    // inserted literally, not interpreted as regex syntax or group references.
    Pattern apielementPattern =
        Pattern.compile("(?<=\\W)" + Pattern.quote(apiElement) + "(?=\\W)");
    String replacement =
        Matcher.quoteReplacement(highlightBeginning + apiElement + highlightEnding);
    for (Element codesnippet : doc.getElementsByTag("pre")) {
      Matcher matcher = apielementPattern.matcher(codesnippet.html());
      codesnippet.html(matcher.replaceAll(replacement));
    }

    // remove clt tags for display
    for (Element clt : doc.getElementsByTag("clt")) {
      clt.unwrap();
    }

    return doc.html();
  }
예제 #26
0
 /**
  * Returns the text of the single element matched by a selector.
  *
  * <p>Exactly one match is a success. Any other number of matches produces a result with an
  * empty string and {@code continua} set to {@code false}; its return code is {@code NOT_FOUND},
  * or {@code ERROR} together with a logged error message when {@code exitIfNotFound} is set.
  *
  * @param elsPar element to search in
  * @param jsoupSelector selector expected to match exactly one element
  * @param exitIfNotFound whether a missing or ambiguous match is reported as an error
  * @return the result carrying the matched text or the failure state
  */
 public static Result text(Element elsPar, String jsoupSelector, boolean exitIfNotFound) {
   Result res = new Result();
   Elements els = elsPar.select(jsoupSelector);

   boolean singleMatch = els != null && els.size() == 1;
   if (singleMatch) {
     return res.setRetStr(els.get(0).text());
   }

   res.setRc(RC.NOT_FOUND);
   if (exitIfNotFound) {
     String message =
         "jsoup selector on elements does not match: " + jsoupSelector + "\n" + elsPar.html();
     res.setRc(RC.ERROR);
     res.setErrorMessage(message);
     log.error(res.getErrorMessage());
   }
   return res.setContinua(false).setRetStr("");
 }
예제 #27
0
 /**
  * Downloads a news listing page and appends its entries to {@code mNews}.
  *
  * <p>Every second {@code td.title} cell holds a story and is paired with the next {@code
  * td.subtext} cell. Rows without a title link, or whose subtext lacks the user and discussion
  * links, are skipped. On a refresh the existing list is cleared first. When the page contains
  * at least two links whose {@code href} starts with {@code /x?fnid=}, the second one is stored
  * in {@code mMoreURLPath}.
  *
  * @param params {@code params[0]} is the URL of the page to load
  * @return {@code true} when the page was loaded, {@code false} on an I/O error
  */
 @Override
 protected Boolean doInBackground(String... params) {
   try {
     Document doc = Jsoup.connect(params[0]).get();
     Element body = doc.body();
     Elements titleEs = body.select("td.title");
     Elements subTitleEs = body.select("td.subtext");
     if (!titleEs.isEmpty()) {
       if (mType == TYPE_REFRESH && mNews.size() > 0) {
         mNews.clear();
       }
       Iterator<Element> subIt = subTitleEs.iterator();
       int index = 1;
       for (Element e : titleEs) {
         boolean isStoryCell = index % 2 == 0;
         index++;
         if (!isStoryCell) {
           continue;
         }
         if (!subIt.hasNext()) {
           // Fewer subtext cells than stories: nothing left to pair with.
           break;
         }
         Element subE = subIt.next();
         Elements aTag = e.select("a");
         Elements spanTag = e.select("span.comhead");
         Elements subEa = subE.select("a");
         if (aTag.isEmpty() || subEa.size() < 2) {
           // Row without a title link or without user/discussion links: skip instead of failing
           // with an IndexOutOfBoundsException.
           continue;
         }
         // NOTE(review): the user is populated but not attached to the entity in this method.
         User user = new User();
         user.setId(subEa.get(0).text());
         NewEntity entity =
             new NewEntity(
                 aTag.get(0).attr("href"),
                 aTag.get(0).text(),
                 spanTag.isEmpty() ? null : spanTag.get(0).text(),
                 subE.html());
         entity.setDiscussUrl(subEa.get(1).attr("href"));
         mNews.add(entity);
       }
     }
     Elements more = doc.getElementsByAttributeValueStarting("href", "/x?fnid=");
     // The second match is used, so make sure there really are two.
     if (more.size() > 1) {
       mMoreURLPath = more.get(1).attr("href");
     }
     return true;
   } catch (IOException e) {
     Log.e(LOG_TAG, "", e);
     return false;
   }
 }
예제 #28
0
  /**
   * Runs every registered matcher against {@code element} and collects the extracted values.
   *
   * <p>Each matcher group extracts a different part of a matching element:
   *
   * <ul>
   *   <li>{@code matchers}: the decoded text of the element
   *   <li>{@code textMatchers}: the decoded outer HTML of the node following the element
   *   <li>{@code subtextMatchers}: the decoded outer HTML of the element's first own text node
   *   <li>{@code htmlMatchers}: the element's inner HTML
   *   <li>{@code ptextMatchers}: the element's plain text as produced by {@code
   *       plainTextFormatter}
   *   <li>{@code attrMatchers}: the value of the attribute configured next to the matcher
   * </ul>
   *
   * Groups are processed in the order listed, so a later group overwrites an earlier value
   * stored under the same key.
   *
   * @param element element to test
   * @return the extracted values by key; empty when nothing matched
   */
  public Map<String, String> attempt(Element element) {
    Map<String, String> attributes = new HashMap<String, String>();
    for (Entry<String, Matcher> entry : matchers.entrySet()) {
      if (entry.getValue().test(element)) {
        attributes.put(entry.getKey(), decode(element.text()));
      }
    }

    for (Entry<String, Matcher> entry : textMatchers.entrySet()) {
      if (entry.getValue().test(element)) {
        Node textNode = element.nextSibling();
        if (null != textNode) {
          attributes.put(entry.getKey(), decode(textNode.outerHtml()));
        }
      }
    }

    for (Entry<String, Matcher> entry : subtextMatchers.entrySet()) {
      if (entry.getValue().test(element)) {
        // An element may have no text node of its own; get(0) on the empty list used to throw.
        if (!element.textNodes().isEmpty()) {
          TextNode textNode = element.textNodes().get(0);
          attributes.put(entry.getKey(), decode(textNode.outerHtml()));
        }
      }
    }

    for (Entry<String, Matcher> entry : htmlMatchers.entrySet()) {
      if (entry.getValue().test(element)) {
        attributes.put(entry.getKey(), element.html());
      }
    }

    for (Entry<String, Matcher> entry : ptextMatchers.entrySet()) {
      if (entry.getValue().test(element)) {
        attributes.put(entry.getKey(), plainTextFormatter.getPlainText(element));
      }
    }

    for (Entry<String, Object[]> entry : attrMatchers.entrySet()) {
      // Each value is a pair: [0] the matcher, [1] the name of the attribute to read.
      Object[] objects = entry.getValue();
      Matcher matcher = (Matcher) objects[0];
      String attr = (String) objects[1];
      if (matcher.test(element)) {
        attributes.put(entry.getKey(), element.attr(attr));
      }
    }
    return attributes;
  }
  /**
   * Builds the frequency links from a result page.
   *
   * <p>Every {@code tr.freq_row} of the first {@code table.philologic_table} yields one link: its
   * count is the integer in the row's {@code td.freq_value} cell (0 when the cell is missing),
   * and its target and text come from the row's first {@code a[href]} (both empty when the row
   * has no link). A page without such a table yields no links.
   *
   * @param html markup of the frequency result page
   */
  public FrequencyImpl(String html) {
    Element table = Jsoup.parse(html).select("table.philologic_table").first();
    if (table == null) {
      return;
    }

    for (Element row : table.select("tr.freq_row")) {
      Element countCell = row.select("td.freq_value").first();
      Element anchor = row.select("a[href]").first();

      int occurrences = 0;
      if (countCell != null) {
        occurrences = Integer.parseInt(countCell.text());
      }

      String href = "";
      String label = "";
      if (anchor != null) {
        href = anchor.attr("href");
        label = anchor.html();
      }

      _links.add(new FrequencyLinkImpl().setCount(occurrences).setLink(href).setText(label));
    }
  }
예제 #30
0
  /**
   * Markup assigned to a {@code <title>} must be kept as escaped text, both when it is set on the
   * title element directly and when the title arrives through its parent's HTML.
   */
  @Test
  public void testSetHtmlTitle() {
    Document doc = Jsoup.parse("<html><head id=2><title id=1></title></head></html>");

    // Directly on the title element.
    Element titleElement = doc.getElementById("1");
    titleElement.html("good");
    assertEquals("good", titleElement.html());
    titleElement.html("<i>bad</i>");
    assertEquals("&lt;i&gt;bad&lt;/i&gt;", titleElement.html());

    // Through the enclosing head element.
    Element headElement = doc.getElementById("2");
    headElement.html("<title><i>bad</i></title>");
    assertEquals("<title>&lt;i&gt;bad&lt;/i&gt;</title>", headElement.html());
  }