private void findRecommendations( @NotNull Document doc, @NotNull BilingualQueryResultBuilder resultBuilder) { // Determine all candidate nodes: Elements alternativeNodes = doc.select("div.cc > p > *"); Language currentLanguage = null; for (Element node : alternativeNodes) { // If the next node is a flagicon, try to determine the language for the next entries from the // class name if (node.tagName().equals("span") && node.hasClass("flagicon")) { Set<String> classNames = node.classNames(); classNames.remove("flagicon"); for (String className : classNames) { Language candidate = Language.getExistingLanguageById(className); if (candidate != null) { currentLanguage = candidate; break; } } } else if (node.tagName().equals("a")) { String recommendationText = node.text(); DictionaryObjectBuilder objectBuilder = ImmutableDictionaryObject.builder(); objectBuilder.setLanguage(currentLanguage).setGeneralForm(recommendationText); resultBuilder.addSimilarRecommendation(objectBuilder.build()); } } }
/**
 * Parses an HTML fragment and turns the links inside the first {@code .subarea} element into
 * {@link AreaVO} objects.
 *
 * <p>The children of that element are walked starting at the second child (index 1). A
 * {@code <b>} child sets the letter applied to the areas that follow; every {@code <a>} child
 * becomes one area. Each area is printed to standard output and consumes one value of the
 * {@code index} counter.
 *
 * @param text the HTML to parse
 * @param pid the value stored as {@code pid} on every created area
 * @return the areas in document order; empty when there is no {@code .subarea} element
 */
public List<AreaVO> parseMessage(String text, int pid) {
  Document parsed = Jsoup.parse(text);
  Element pageBody = parsed.body();
  List<AreaVO> result = new ArrayList<AreaVO>();

  Elements subareas = pageBody.getElementsByClass("subarea");
  if (subareas.isEmpty()) {
    return result;
  }

  Elements entries = subareas.get(0).children();
  String currentLetter = "";
  int position = 1; // the first child is deliberately skipped, as in the original loop
  while (position < entries.size()) {
    Element entry = entries.get(position);
    position++;

    if ("b".equals(entry.tagName())) {
      currentLetter = entry.text();
    } else if ("a".equals(entry.tagName())) {
      AreaVO area = new AreaVO();
      area.setLetter(currentLetter);
      area.setName(entry.text());
      area.setOrderIdx(index);
      area.setPid(pid);

      // NOTE(review): the fixed offset 7 presumably skips a known href prefix - confirm.
      String link = entry.attr("href");
      area.setPinyin(link.substring(7, link.lastIndexOf("/")));

      index++;
      System.out.println(area.toString());
      result.add(area);
    }
  }
  return result;
}
public static void main(String[] args) throws IOException { // Validate.isTrue(args.length == 1, "usage: supply url to fetch"); // String url = args[0]; // String url = "http://www.hao123.com"; String url = "http://www.iteye.com/login"; print("Fetching %s...", url); Document doc = Jsoup.connect(url).get(); Elements links = doc.select("a[href]"); Elements media = doc.select("[src]"); Elements imports = doc.select("link[href]"); print("\nMedia: (%d)", media.size()); for (Element src : media) { if (src.tagName().equals("img")) print( " * %s: <%s> %sx%s (%s)", src.tagName(), src.attr("abs:src"), src.attr("width"), src.attr("height"), trim(src.attr("alt"), 20)); else print(" * %s: <%s>", src.tagName(), src.attr("abs:src")); } print("\nImports: (%d)", imports.size()); for (Element link : imports) { print(" * %s <%s> (%s)", link.tagName(), link.attr("abs:href"), link.attr("rel")); } print("\nLinks: (%d)", links.size()); for (Element link : links) { print(" * a: <%s> (%s)", link.attr("abs:href"), trim(link.text(), 35)); } }
/**
 * Serializes an element into a predictable HTML form: attributes are written in alphabetical
 * order with single-quoted values, text nodes are trimmed, and a closing tag is always emitted.
 * Child nodes that are neither elements nor text nodes are skipped.
 *
 * @param producedElem the element to serialize
 * @param sb the buffer the markup is appended to
 * @return the full content of {@code sb} after appending
 */
private String elementToHtml(Element producedElem, StringBuilder sb) {
  ArrayList<String> sortedKeys = new ArrayList<String>();
  for (Attribute attribute : producedElem.attributes().asList()) {
    sortedKeys.add(attribute.getKey());
  }
  Collections.sort(sortedKeys);

  sb.append('<').append(producedElem.tagName());
  for (String key : sortedKeys) {
    sb.append(' ').append(key).append('=');
    sb.append('\'').append(producedElem.attr(key)).append('\'');
  }
  sb.append('>');

  for (Node childNode : producedElem.childNodes()) {
    if (childNode instanceof Element) {
      elementToHtml((Element) childNode, sb);
      continue;
    }
    if (childNode instanceof TextNode) {
      TextNode textNode = (TextNode) childNode;
      sb.append(textNode.text().trim());
    }
  }

  sb.append("</");
  sb.append(producedElem.tagName());
  sb.append('>');
  return sb.toString();
}
/**
 * Reads one item from a design element, adds it via {@code addItem}, and then does the same for
 * every child element, making each child item a child of this one.
 *
 * <p>The item id is the value of the element's {@code text} attribute. An {@code icon} attribute
 * is read as a {@link Resource} and set as the item icon; the presence of a {@code selected}
 * attribute adds the item id to {@code selected}.
 *
 * @since 7.5.0
 * @param node an element representing the item (tree node)
 * @param selected a set accumulating the ids of items marked as selected
 * @param context the DesignContext instance used in parsing; only passed on to recursive calls
 * @return the item id of the new item
 * @throws DesignException if the tag name of the {@code node} element is not {@code node}
 */
@Override
protected String readItem(Element node, Set<String> selected, DesignContext context) {
  String tag = node.tagName();
  boolean isItemNode = "node".equals(tag);
  if (!isItemNode) {
    String owner = getClass().getSimpleName();
    throw new DesignException("Unrecognized child element in " + owner + ": " + node.tagName());
  }

  final String itemId = node.attr("text");
  addItem(itemId);

  if (node.hasAttr("icon")) {
    setItemIcon(
        itemId, DesignAttributeHandler.readAttribute("icon", node.attributes(), Resource.class));
  }

  if (node.hasAttr("selected")) {
    selected.add(itemId);
  }

  for (Element childNode : node.children()) {
    setParent(readItem(childNode, selected, context), itemId);
  }
  return itemId;
}
/**
 * Looks the element up again through its parent, as the child node at the element's sibling
 * index, and returns that node.
 *
 * @param aInElement the element to resolve; it must have a parent
 * @return the parent's child node at the element's sibling index
 * @throws RuntimeException if that node's name differs from the element's tag name
 */
private static Node toNode(Element aInElement) {
  int lIndex = aInElement.siblingIndex();
  Node lResolved = aInElement.parent().childNode(lIndex);

  String lNodeName = lResolved.nodeName();
  if (lNodeName.equals(aInElement.tagName())) {
    return lResolved;
  }
  throw new RuntimeException(lResolved.nodeName() + " != " + aInElement.tagName());
}
@Test public void parsesSimpleDocument() { String html = "<html><head><title>First!</title></head><body><p>First post! <img src=\"foo.png\" /></p></body></html>"; Document doc = Jsoup.parse(html); // need a better way to verify these: Element p = doc.child(1).child(0); assertEquals("p", p.tagName()); Element img = p.child(0); assertEquals("foo.png", img.attr("src")); assertEquals("img", img.tagName()); }
@Test public void testGetElementById() { Document doc = Jsoup.parse(reference); Element div = doc.getElementById("div1"); assertEquals("div1", div.id()); assertNull(doc.getElementById("none")); Document doc2 = Jsoup.parse("<div id=1><div id=2><p>Hello <span id=2>world!</span></p></div></div>"); Element div2 = doc2.getElementById("2"); assertEquals("div", div2.tagName()); // not the span Element span = div2.child(0).getElementById("2"); // called from <p> context should be span assertEquals("span", span.tagName()); }
private static String cleanHtml(final Node node) { if (node instanceof Element) { Element element = ((Element) node); StringBuilder accum = new StringBuilder(); accum.append("<").append(element.tagName()); for (Attribute attribute : element.attributes()) { if (!(attribute.getKey().startsWith("_"))) { accum.append(" "); accum.append(attribute.getKey()); accum.append("=\""); accum.append(attribute.getValue()); accum.append('"'); } } if (element.childNodes().isEmpty() && element.tag().isEmpty()) { accum.append(" />"); } else { accum.append(">"); for (Node child : element.childNodes()) accum.append(cleanHtml(child)); accum.append("</").append(element.tagName()).append(">"); } return accum.toString(); } else if (node instanceof TextNode) { return ((TextNode) node).getWholeText(); } else if (node instanceof XmlDeclaration) { // HACK if (node.childNodes().isEmpty()) { return ""; } return node.outerHtml(); } else if (node instanceof Comment) { // HACK: elide comments for now. return ""; } else if (node instanceof DataNode && node.childNodes().isEmpty()) { // No child nodes are defined but we have to handle content if such exists, example // <script language="JavaScript">var a = { name: "${user.name}"}</script> String content = node.attr("data"); if (Strings.empty(content)) { return ""; } return content; } else { return node.outerHtml(); } }
public void download(Connection aInConnection, Collection<Image> images) throws IOException { aInConnection.url(url); Document lDocument = aInConnection.get(); Element lMain = lDocument.getElementById("main"); Elements lContents = lMain.getElementsByClass("content"); if (lContents.size() == 1) { StringBuilder sb = new StringBuilder(); Element lContent = lContents.first(); collectImages(lContent, images); Elements lLightboxElements = lContent.getElementsByClass("lightbox"); for (Element lLightboxElement : lLightboxElements) { Collection<Node> lImageNodes = extractImageNodes(lLightboxElement); Element lParent = lLightboxElement.parent(); int i = lLightboxElement.siblingIndex(); lParent.insertChildren(i, lImageNodes); lLightboxElement.remove(); } Elements lChildElements = lContent.children(); for (Element lChildElement : lChildElements) { if (lChildElement.hasClass("clear")) { // no more post content break; } if (title == null && lChildElement.tagName().equals("h1")) { // the first h1 header is the title title = lChildElement.html(); } else { if (excerpt == null && lChildElement.tagName().equals("p")) { excerpt = lChildElement.text(); } String lStr = lChildElement.toString(); sb.append(lStr); } } content = sb.toString(); Elements lDateElements = lContent.getElementsByClass("date"); String lHunDate = lDateElements.first().html(); date = new PostDate(lHunDate); } else { System.out.println("More than one content in main section of post page " + toString()); } }
/**
 * Determines the selected pupil of a page, inserting any pupil that is not stored yet.
 *
 * <p>Every {@code option} inside the element with id {@code ctl00_topMenu_pupil_drdPupils} is
 * looked up by its value via {@code Pupil.getByFormId}; unknown pupils are created and inserted.
 * The last option whose {@code selected} attribute equals {@code "selected"} is the result. When
 * the page contains no such option at all, the pupil is built from the first
 * {@code user-name} element and the element with id {@code ctl00_topMenu_tbUserId} instead.
 *
 * @param doc the parsed page
 * @return the selected pupil, never {@code null}
 * @throws ParseException if options exist but none of them is selected
 */
public static Pupil getSelectedPupil(Document doc) throws ParseException {
  boolean optionSeen = false;
  Pupil chosen = null;

  for (Element selector : doc.getElementsByAttributeValue("id", "ctl00_topMenu_pupil_drdPupils")) {
    for (Element option : selector.getAllElements()) {
      if (!option.tagName().equals("option")) {
        continue;
      }
      String formId = option.attr("value");
      optionSeen = true;

      Pupil current = Pupil.getByFormId(formId);
      if (current == null) {
        current = new Pupil(option.text(), formId);
        long rowId = current.insert();
        if (BuildConfig.DEBUG) {
          Log.d("GshisHTMLParser", TS.get() + " Pupil.insert() = " + rowId);
        }
      }

      if (option.hasAttr("selected") && option.attr("selected").equals("selected")) {
        chosen = current;
      }
    }
  }

  if (!optionSeen) {
    // No pupil selector on the page: fall back to the user name / user id fields.
    if (BuildConfig.DEBUG) {
      Log.d("GshisParser", TS.get() + " Alternative fields found!");
    }
    Element nameElement = doc.getElementsByClass("user-name").first();
    Element idElement = doc.getElementsByAttributeValue("id", "ctl00_topMenu_tbUserId").first();
    String name = nameElement.text();
    String id = idElement.attr("value");
    if (BuildConfig.DEBUG) {
      Log.d("GshisParser", TS.get() + " name=" + name + " id=" + id);
    }

    Pupil fallback = Pupil.getByFormId(id);
    if (fallback == null) {
      fallback = new Pupil(name, id);
      long rowId = fallback.insert();
      if (BuildConfig.DEBUG) {
        Log.d("GshisParser", TS.get() + " Pupil.insert() = " + rowId);
      }
    }
    chosen = fallback;
  }

  if (chosen == null) {
    throw new ParseException("Pupils not found", 0);
  }
  return chosen;
}
/** * @param theLinks * @throws IOException */ public void getSecondLinks(ArrayList<String> theLinks) throws IOException { String temp = null; Document doc = null; boolean flag; for (String sLink : theLinks) { if (sLink.endsWith(".asx") == true || sLink.endsWith(".swf") == true) { stationLinks2.add(sLink); print("Written to file: %s", sLink); } else { doc = parseUrl(sLink, 0); if (doc != null) { Elements media = doc.select("[src]"); print("Fetching %s --> ", sLink); flag = false; for (Element src : media) { if (src.tagName().equals("embed") == true) { flag = true; temp = src.attr("abs:src"); stationLinks2.add(temp); break; // link found, load next url } } // end nested for if (flag == false) { // the code has no embed tag stationLinks2.add(sLink); } } } } // end outer for writeLinksToFile(links2FileName, stationLinks2); print("Written %s to file, second links.", stationLinks2.size()); } // end method
/**
 * Appends the ancestors of {@code el} to {@code parents}, nearest first, stopping before the
 * first ancestor whose tag name is {@code #root} (or when there is no further parent).
 *
 * @param el the element whose ancestors are collected
 * @param parents the list receiving the ancestors
 */
private static void accumulateParents(Element el, Elements parents) {
  for (Element ancestor = el.parent();
      ancestor != null && !ancestor.tagName().equals("#root");
      ancestor = ancestor.parent()) {
    parents.add(ancestor);
  }
}
private void recurse(Element element) { ElementAction action = classifyElement(element); if (action == ElementAction.Whitespace || action == ElementAction.Sentence) { appendSpace(); } for (Node childNode : element.childNodes()) { // n.b., cdata not possible if we are coming from TagSoup. If we also handle // real xhtml by directly parsing it, then we have another story on our hands. // though we could use canonical XML to get rid of them. if (childNode instanceof TextNode && action != ElementAction.Banned) { TextNode textContent = (TextNode) childNode; String textString = textContent.text(); append(textContent, textString); } else if (childNode instanceof Element) { recurse((Element) childNode); } } if (action == ElementAction.Whitespace) { appendSpace(); } else if (action == ElementAction.Sentence) { appendPeriod(); } else if (action == ElementAction.Mark) { Mark mark = new Mark(); mark.setOffset(pcDataOffset); mark.setTag(element.tagName()); } }
public void head(Node source, int depth) { if (source instanceof Element) { Element sourceEl = (Element) source; if (whitelist.isSafeTag(sourceEl.tagName())) { // safe, clone and copy safe attrs ElementMeta meta = createSafeElement(sourceEl); Element destChild = meta.el; destination.appendChild(destChild); numDiscarded += meta.numAttribsDiscarded; destination = destChild; } else if (source != root) { // not a safe tag, so don't add. don't count root against discarded. numDiscarded++; } } else if (source instanceof TextNode) { TextNode sourceText = (TextNode) source; TextNode destText = new TextNode(sourceText.getWholeText(), source.baseUri()); destination.appendChild(destText); } else if (source instanceof DataNode && whitelist.isSafeTag(source.parent().nodeName())) { DataNode sourceData = (DataNode) source; DataNode destData = new DataNode(sourceData.getWholeData(), source.baseUri()); destination.appendChild(destData); } else { // else, we don't care about comments, xml proc instructions, etc numDiscarded++; } }
/**
 * Tells whether a node should be treated as block-level: it must be an element that either
 * reports itself as a block or is a {@code br}.
 *
 * @param n the node to test; may be {@code null}
 * @return {@code true} for block elements and {@code br}; {@code false} for {@code null} and
 *     for non-element nodes
 */
private boolean isBlock(Node n) {
  if (!(n instanceof Element)) {
    // instanceof is false for null as well
    return false;
  }
  Element element = (Element) n;
  if (element.isBlock()) {
    return true;
  }
  return element.tagName().equals("br");
}
/**
 * Reads the topic id and title from the first {@code .item_title > a} link below {@code ele}.
 * The id is derived from the link's {@code href}; the title is the link's inner HTML.
 *
 * @param topicBuilder the builder receiving id and title
 * @param ele the element containing the title link; at least one match is required
 */
private static void parseTitle(Topic.Builder topicBuilder, Element ele) {
  Elements titleLinks = ele.select(".item_title > a");
  Element titleLink = titleLinks.get(0);
  Preconditions.checkState(titleLink.tagName().equals("a"));

  String href = titleLink.attr("href");
  topicBuilder.setId(Topic.getIdFromUrl(href));
  topicBuilder.setTitle(titleLink.html());
}
/** An attribute value whose closing quote is missing still yields the element and its id. */
@Test
public void parsesUnterminatedAttribute() {
  String unterminated = "<p id=\"foo";
  Element found = Jsoup.parse(unterminated).getElementById("foo");

  assertNotNull(found);
  assertEquals("p", found.tagName());
}
static void parseMember(Topic.Builder builder, Element ele) { final Member.Builder memberBuilder = new Member.Builder(); // get member url ele = ele.child(0); Preconditions.checkState(ele.tagName().equals("a")); final String url = ele.attr("href"); memberBuilder.setUsername(Member.getNameFromUrl(url)); // get member avatar final Avatar.Builder avatarBuilder = new Avatar.Builder(); ele = ele.child(0); Preconditions.checkState(ele.tagName().equals("img")); avatarBuilder.setUrl(ele.attr("src")); memberBuilder.setAvatar(avatarBuilder.createAvatar()); builder.setMember(memberBuilder.createMember()); }
/**
 * Reads the week selector (id {@code ctl00_body_week_drdWeeks}) of a page, registers every week
 * the schedule does not know yet, and returns the selected one.
 *
 * <p>For a new week the start date is parsed from the option text: the part before {@code " -"}
 * is the day and month, and the year comes from the schedule's form text - the part before its
 * dash when the month is greater than 7, otherwise the part after it.
 * NOTE(review): this presumably expects a form text like {@code "YYYY - YYYY"} - confirm.
 *
 * <p>The week whose option has {@code selected="selected"} is marked as loaded and updated.
 *
 * @param doc the parsed page
 * @param s the schedule the weeks belong to
 * @return the selected week, or {@code null} when options exist but none is selected
 * @throws ParseException if the page has no week option, or a start date cannot be parsed
 */
public static Week getSelectedWeek(Document doc, Schedule s) throws ParseException {
  SimpleDateFormat startFormat = new SimpleDateFormat("yyyy dd.MM", Locale.ENGLISH);
  startFormat.setTimeZone(TimeZone.getTimeZone("Europe/Moscow"));

  boolean optionSeen = false;
  Week chosen = null;

  for (Element selector : doc.getElementsByAttributeValue("id", "ctl00_body_week_drdWeeks")) {
    for (Element option : selector.getAllElements()) {
      if (!option.tagName().equals("option")) {
        continue;
      }
      String label = option.text();
      optionSeen = true;

      Week current = s.getWeek(option.attr("value"));
      if (current == null) {
        current = new Week();
        String firstDay = label.substring(0, label.indexOf("-") - 1);
        String month = firstDay.substring(firstDay.indexOf(".") + 1);
        String year;
        if (Integer.parseInt(month) > 7) {
          year = s.getFormText().substring(0, s.getFormText().indexOf("-") - 1);
        } else {
          year = s.getFormText().substring(s.getFormText().indexOf("-") + 2);
        }
        current.setStart(startFormat.parse(year + " " + firstDay));
        current.setFormText(option.text());
        current.setFormId(option.attr("value"));
        s.addWeek(current);
      }

      boolean isSelected =
          option.hasAttr("selected") && option.attr("selected").equals("selected");
      if (isSelected) {
        chosen = current;
        long updated = current.setLoaded().update();
        if (BuildConfig.DEBUG) {
          Log.d("GshisHTMLParser", TS.get() + " Week.update() = " + updated);
        }
      }
    }
  }

  if (!optionSeen) {
    throw new ParseException("Weeks not found", 0);
  }
  return chosen;
}
// recursively processes the element to replace <br>'s with \n private void fixLineBreaks(Element el) { for (final Element e : el.children()) { if (e.tagName().equals("br")) { e.before("\n"); e.remove(); } else { fixLineBreaks(e); } } }
/** * Complement of HtmlTemplateCompiler#lexicalClimb(). This method pops off the stack of lexical * scopes when we're done processing a sitebricks widget. */ private void lexicalDescend(PageCompilingContext pc, Element element, boolean shouldPopScope) { // pop form if ("form".equals(element.tagName())) pc.form = null; // pop compiler if the scope ends if (shouldPopScope) { pc.lexicalScopes.pop(); } }
/**
 * Determines whether an element is not focusable using keys.
 *
 * <p>Only elements that are focusable by default ({@code a}, {@code button}, {@code input},
 * {@code select}, {@code textarea}) are considered; such an element is reported as not focusable
 * when its tabindex attribute is a negative integer. A tabindex that is not a valid integer is
 * treated like a missing one, so the element keeps its default focusability instead of causing
 * a {@link NumberFormatException}.
 *
 * <p>TODO(jharty): We need to determine which elements are generally focusable (where focus is
 * disabled by setting tabindex to a negative number) and which ones generally are NOT focusable,
 * but may become focusable by setting a positive tabindex. TODO(jharty): This could move to
 * OnClickIsFocusable?
 *
 * @param element the element to test
 * @return true when the element is not focusable, else false.
 */
static boolean notFocusable(Element element) {
  List<String> focusable = Arrays.asList("a", "button", "input", "select", "textarea");
  if (!focusable.contains(element.tagName()) || !element.hasAttr(TAB_INDEX)) {
    return false;
  }
  try {
    // Surrounding whitespace is tolerated, matching how HTML parses integer attributes.
    return Integer.parseInt(element.attr(TAB_INDEX).trim()) < 0;
  } catch (NumberFormatException ignored) {
    // An unparsable tabindex (e.g. "" or "abc") does not disable focus.
    return false;
  }
}
@Test public void parsesRoughAttributes() { String html = "<html><head><title>First!</title></head><body><p class=\"foo > bar\">First post! <img src=\"foo.png\" /></p></body></html>"; Document doc = Jsoup.parse(html); // need a better way to verify these: Element p = doc.child(1).child(0); assertEquals("p", p.tagName()); assertEquals("foo > bar", p.attr("class")); }
public Map doProcess() { if (!isInit) { init(); } Iterator<String> it = map.keySet().iterator(); while (it.hasNext()) { String key = it.next(); Element val = map.get(key); // 如果是ul或者table,用相应的组件渲染 if (("ul".equals(val.tagName().toLowerCase())) || ("table".equals(val.tagName().toLowerCase()))) { System.out.println("------------------列表开始-----------------------------"); Elements links = val.select("a"); for (Element ele : links) { System.out.println("a:" + ele.attr("abs:href") + ",文本:" + ele.text()); } System.out.println("------------------列表结束-----------------------------"); } else { System.out.println("------------------非列表-----------------------------"); if ("a".equals(val.tagName().toLowerCase())) { System.out.println("a:" + val.attr("abs:href") + ",文本:" + val.text()); } else { if ("span".equals(val.tagName().toLowerCase())) { System.out.println("父容器:" + val.parent().tagName()); } System.out.println("标签:" + val.tagName() + ",html:" + val.html()); } } } System.out.println("---------------top menu---------------------"); it = topMenumap.keySet().iterator(); while (it.hasNext()) { String key = (String) it.next(); Element val = topMenumap.get(key); Elements links = val.select("a"); for (Element ele : links) { System.out.println("a:" + ele.attr("abs:href") + ",文本:" + ele.text()); } } return map; }
/**
 * Recursively walks an element tree and files elements into {@code map} and {@code topMenumap},
 * keyed by {@code "<level>,<hashCode>"}.
 *
 * <p>The element's inner HTML is first replaced by the result of {@code deleteComent} applied to
 * it. Then:
 *
 * <ul>
 *   <li>An element with children that is a {@code ul} or {@code table} is handled as a whole: if
 *       its HTML (with spaces removed) contains "首页" or its id contains "nav", each of its links
 *       is put into {@code topMenumap} unless already present; otherwise the element itself is
 *       put into {@code map}. This is repeated once per child, with the same outcome each time.
 *   <li>Any other element with children has each child processed recursively one level deeper.
 *   <li>An element without children is put into {@code map} one level deeper, unless it is a
 *       {@code script} or its HTML is empty according to {@code StringUtils.isNotEmpty}.
 * </ul>
 *
 * @param parentElement the element to process
 * @param level the depth of {@code parentElement}; incremented before it is used in keys
 */
private void getChildElement(Element parentElement, Integer level) {
  // Re-set the element's HTML to the output of deleteComent (presumably comment-free - confirm).
  parentElement.html(deleteComent(parentElement.html()));
  // System.out.println("key:"+(level+","+parentElement.hashCode())+",value:"+parentElement.html());
  if (parentElement.children().size() > 0) {
    level += 1;
    for (int i = 0; i < parentElement.children().size(); i++) {
      if (("ul".equals(parentElement.tagName().toLowerCase()))
          || ("table".equals(parentElement.tagName().toLowerCase()))) {
        // Treated as one whole tag.
        // Remove Chinese and English spaces.
        // NOTE(review): both patterns below look like a plain space here; the second was
        // presumably a full-width space originally - verify the literal in the real file.
        String html = parentElement.html().replaceAll(" ", "").replaceAll(" ", "");
        if (html.contains("首页") || parentElement.id().contains("nav")) {
          // Looks like a navigation / home-page menu: collect its links as top-menu entries.
          // System.out.println("----------------------首页Start-----------------------------");
          Elements links = parentElement.select("a");
          for (Element ele : links) {
            // Only the first occurrence of a key is kept.
            if (topMenumap.get(level + "," + ele.hashCode()) == null) {
              topMenumap.put(level + "," + ele.hashCode(), ele);
              // System.out.println(level + "," + ele.hashCode() + ",---------------" +
              // ele.html());
              // System.out.println("a:" +
              // ele.attr("abs:href") + ",text:" + ele.text());
            }
          }
          // System.out.println("----------------------首页End-----------------------------");
        } else {
          map.put(level + "," + parentElement.hashCode(), parentElement);
        }
      } else {
        getChildElement(parentElement.child(i), level);
      }
    }
  } else {
    // Leaf element: scripts are ignored, everything with non-empty HTML is recorded.
    if ("script".equals(parentElement.tagName().toLowerCase())) {
      return;
    }
    if (StringUtils.isNotEmpty(parentElement.html())) {
      level += 1;
      map.put(level + "," + parentElement.hashCode(), parentElement);
    }
  }
}
/**
 * Renders an element tree as indented text. Each element produces a line of the form
 * {@code <indent><tag>:<own text>}, followed by the rendering of each child one level deeper;
 * every child rendering is followed by an additional newline.
 *
 * @param root the element to render
 * @param indentation the number of leading spaces for {@code root}'s line
 * @return the rendered text
 */
public static String printNode(Element root, int indentation) {
  StringBuilder out = new StringBuilder();
  for (int remaining = indentation; remaining > 0; remaining--) {
    out.append(' ');
  }
  out.append(root.tagName()).append(":").append(root.ownText()).append("\n");

  int childIndentation = indentation + 1;
  for (Element child : root.children()) {
    out.append(printNode(child, childIndentation)).append("\n");
  }
  return out.toString();
}
/**
 * Appends the text below {@code e} to {@code accum}, skipping every child node that
 * {@code unlikely} rejects.
 *
 * <p>Before descending into a child element a single space is appended when the element is a
 * block, the buffer is non-empty and does not already end in whitespace, or - failing that -
 * when the element is a {@code br}.
 *
 * @param e the element whose text is collected
 * @param accum the buffer receiving the text
 */
void appendTextSkipHidden(Element e, StringBuilder accum) {
  for (Node node : e.childNodes()) {
    if (unlikely(node)) {
      continue;
    }

    if (node instanceof TextNode) {
      accum.append(((TextNode) node).text());
      continue;
    }

    if (!(node instanceof Element)) {
      continue;
    }
    Element childElement = (Element) node;
    boolean blockNeedsGap =
        accum.length() > 0 && childElement.isBlock() && !lastCharIsWhitespace(accum);
    if (blockNeedsGap || childElement.tagName().equals("br")) {
      accum.append(" ");
    }
    appendTextSkipHidden(childElement, accum);
  }
}
/**
 * Creates a copy of an element that carries only whitelisted attributes, plus any attributes
 * the whitelist enforces for its tag. Children are not copied.
 *
 * @param sourceEl the element to copy
 * @return the copy together with the number of source attributes that were dropped
 */
private ElementMeta createSafeElement(Element sourceEl) {
  final String tagName = sourceEl.tagName();
  final Attributes keptAttrs = new Attributes();
  final Element copy = new Element(Tag.valueOf(tagName), sourceEl.baseUri(), keptAttrs);

  int dropped = 0;
  for (Attribute candidate : sourceEl.attributes()) {
    if (!whitelist.isSafeAttribute(tagName, sourceEl, candidate)) {
      dropped++;
      continue;
    }
    keptAttrs.put(candidate);
  }

  // Enforced attributes are added last.
  keptAttrs.addAll(whitelist.getEnforcedAttributes(tagName));
  return new ElementMeta(copy, dropped);
}
/**
 * Reads the semester selector (id {@code ctl00_body_drdTerms}) of a page, registers every
 * semester the schedule does not know yet, and returns the selected one.
 *
 * <p>For a new semester the start and stop dates ({@code dd.MM.yyyy}) are cut out of the option
 * text: the start runs from character 12 up to the space before the dash, the stop from two
 * characters after the dash up to two characters before the end.
 * NOTE(review): these offsets assume a fixed option-text layout - confirm against the page.
 *
 * <p>The semester whose option has {@code selected="selected"} is marked as loaded and updated.
 *
 * @param doc the parsed page
 * @param sch the schedule the semesters belong to
 * @return the selected semester, or {@code null} when options exist but none is selected
 * @throws ParseException if the page has no semester option, or a date cannot be parsed
 */
public static GradeSemester getActiveGradeSemester(Document doc, Schedule sch)
    throws ParseException {
  SimpleDateFormat dateFormat = new SimpleDateFormat("dd.MM.yyyy", Locale.ENGLISH);
  dateFormat.setTimeZone(TimeZone.getTimeZone("Europe/Moscow"));

  boolean optionSeen = false;
  GradeSemester chosen = null;

  for (Element selector : doc.getElementsByAttributeValue("id", "ctl00_body_drdTerms")) {
    for (Element option : selector.getAllElements()) {
      if (!option.tagName().equals("option")) {
        continue;
      }
      String label = option.text();
      optionSeen = true;

      GradeSemester current = sch.getSemester(option.attr("value"));
      if (current == null) {
        current = new GradeSemester();
        current.setStart(dateFormat.parse(label.substring(12, label.indexOf("-") - 1)));
        current.setStop(
            dateFormat.parse(label.substring(label.indexOf("-") + 2, label.length() - 2)));
        current.setFormText(option.text());
        current.setFormId(option.attr("value"));
        sch.addSemester(current);
      }

      boolean isSelected =
          option.hasAttr("selected") && option.attr("selected").equals("selected");
      if (isSelected) {
        long updated = current.setLoaded().update();
        chosen = current;
        if (BuildConfig.DEBUG) {
          Log.d("GshisHTMLParser", TS.get() + " Semester.update() = " + updated);
        }
      }
    }
  }

  if (!optionSeen) {
    throw new ParseException("Semesters not found", 0);
  }
  return chosen;
}