@Test public void insertChildrenAtPosition() { Document doc = Jsoup.parse( "<div id=1>Text1 <p>One</p> Text2 <p>Two</p></div><div id=2>Text3 <p>Three</p></div>"); Element div1 = doc.select("div").get(0); Elements p1s = div1.select("p"); Element div2 = doc.select("div").get(1); assertEquals(2, div2.childNodeSize()); div2.insertChildren(-1, p1s); assertEquals(2, div1.childNodeSize()); // moved two out assertEquals(4, div2.childNodeSize()); assertEquals(3, p1s.get(1).siblingIndex()); // should be last List<Node> els = new ArrayList<Node>(); Element el1 = new Element(Tag.valueOf("span"), "").text("Span1"); Element el2 = new Element(Tag.valueOf("span"), "").text("Span2"); TextNode tn1 = new TextNode("Text4", ""); els.add(el1); els.add(el2); els.add(tn1); assertNull(el1.parent()); div2.insertChildren(-2, els); assertEquals(div2, el1.parent()); assertEquals(7, div2.childNodeSize()); assertEquals(3, el1.siblingIndex()); assertEquals(4, el2.siblingIndex()); assertEquals(5, tn1.siblingIndex()); }
private void recurse(final Element element, final Map<String, Object> values, final int depth) { final Tag tag = element.tag(); final Set<String> classes = element.classNames(); final String link = element.attr("href"); final Object content = extractChildContent(element); if (!classes.isEmpty()) { removeEmpty(classes); // toplevel classes define type if (tag.isBlock()) { if (depth == 0) { // store type attribute values.put("type", classes); for (final Element child : element.children()) { recurse(child, values, depth + 1); } } else { final Map<String, Object> childMap = new LinkedHashMap<>(); values.put(classes.iterator().next(), childMap); if (content != null) { childMap.put("name", content); } for (final Element child : element.children()) { recurse(child, childMap, depth + 1); } } } else if (tag.isInline()) { // extract href and store as URL if (classes.contains("url") && StringUtils.isNotBlank(link)) { values.put("url", link); classes.remove("url"); } if (content != null) { for (final String type : classes) { values.put(type, content); } } } } }
@Test public void testHashcodeIsStableWithContentChanges() { Element root = new Element(Tag.valueOf("root"), ""); HashSet<Element> set = new HashSet<Element>(); // Add root node: set.add(root); root.appendChild(new Element(Tag.valueOf("a"), "")); assertTrue(set.contains(root)); }
/**
 * Appends this element's opening tag, including its attributes, to {@code accum}.
 *
 * <p>When pretty-printing is on and something has already been written, the tag is indented
 * first if this tag or its parent's tag formats as a block, or if outline mode is enabled.
 * A childless element with a self-closing tag is terminated with {@code " />"}; anything else
 * with {@code ">"}.
 *
 * @param accum the buffer being written to
 * @param depth the nesting depth passed on to {@code indent}
 * @param out the output settings in effect
 */
void outerHtmlHead(StringBuilder accum, int depth, Document.OutputSettings out) {
  if (accum.length() > 0 && out.prettyPrint()) {
    final boolean blockContext =
        tag.formatAsBlock() || (parent() != null && parent().tag().formatAsBlock());
    if (blockContext || out.outline()) {
      indent(accum, depth, out);
    }
  }

  accum.append("<").append(tagName());
  attributes.html(accum, out);

  final boolean selfClosed = childNodes.isEmpty() && tag.isSelfClosing();
  accum.append(selfClosed ? " />" : ">");
}
/**
 * Appends this element's closing tag to {@code accum}.
 *
 * <p>Nothing is written for a childless element with a self-closing tag. Otherwise, when
 * pretty-printing is on and the element has children, the closing tag is indented if the tag
 * formats as a block, or if outline mode is on and the content is more than a single text
 * node.
 *
 * @param accum the buffer being written to
 * @param depth the nesting depth passed on to {@code indent}
 * @param out the output settings in effect
 */
void outerHtmlTail(StringBuilder accum, int depth, Document.OutputSettings out) {
  if (childNodes.isEmpty() && tag.isSelfClosing()) {
    return;
  }

  final int childCount = childNodes.size();
  if (out.prettyPrint() && childCount > 0) {
    boolean wantsIndent = tag.formatAsBlock();
    if (!wantsIndent && out.outline()) {
      // Exactly one child here means: indent only when it is not a lone text node.
      wantsIndent = childCount > 1 || !(childNodes.get(0) instanceof TextNode);
    }
    if (wantsIndent) {
      indent(accum, depth, out);
    }
  }

  accum.append("</").append(tagName()).append(">");
}
private static Count modify(Element e, Count c) { List<Node> o = e.childNodes(); if (o.size() == 0 && e.textNodes().size() == 0) return new Count(c.getCount(), c.getPgCount()); for (Node n : o) { if (n instanceof TextNode) { TextNode nd = (TextNode) n; String[] arr = nd.text().trim().split("\\s"); String txt = ""; List<Node> nodes = new ArrayList<Node>(); int j = 0; TextNode ndTemp = new TextNode("", " "); nodes.add(j, ndTemp); for (int i = 0; i < arr.length; i++) { if (arr[i].length() > 0) c.incrementCount(); if (c.getCount() > PAGE_COUNT) { ((TextNode) nodes.get(j)).text(((TextNode) nodes.get(j)).text() + " "); j++; nodes.add(j, new Element(Tag.valueOf("pageid=" + c.getPgCount()), "")); j++; nodes.add(j, new TextNode(" " + arr[i] + " ", "")); // "<!--page id="+c.getPgCount()+ "--!>" + " " + arr[i]); // txt = txt + " " + "<!--page id="+c.getPgCount()+ "--!>" + " " + arr[i]; //<div // style='visibility:hidden'>Page="+pageCount+"</div> c.incrementPgCount(); c.setCount(0); } else { // txt = txt + " " + arr[i]; ((TextNode) nodes.get(j)).text(((TextNode) nodes.get(j)).text() + " " + arr[i]); } } if (nodes.size() > 1) { Element etemp = new Element(Tag.valueOf("span"), ""); nd.replaceWith(etemp); for (Node d : nodes) { etemp.appendChild(d); } } // nd.text(ndTemp.text()); } else if (n instanceof Element) { Count ctemp = modify((Element) n, c); c.setCount(ctemp.getCount()); c.setPgCount(ctemp.getPgCount()); } } return c; }
@Override public int hashCode() { // todo: fixup, not very useful int result = super.hashCode(); result = 31 * result + (tag != null ? tag.hashCode() : 0); return result; }
/**
 * Initialises the mocks for this test instance and builds the {@code JSoupMetaElement} under
 * test around an empty {@code meta} element with base URI {@code about:blank}.
 *
 * @throws IOException declared by the fixture signature
 */
@Before
public void setUp() throws IOException {
  initMocks(this);

  final Tag metaTag = Tag.valueOf("meta");
  final Element rawMeta = new Element(metaTag, "about:blank", new Attributes());
  metaElement = new JSoupMetaElement(rawMeta, ownerDocument);
}
public static void processEpub(String bookPath, String dest) throws FileNotFoundException, IOException { EpubReader reader = new EpubReader(); Book b = reader.readEpub(new FileInputStream(new File(bookPath))); String content = ""; int pagecount = 1; int tempCounter; Count cnt = new Count(0, 0); for (Resource res : b.getContents()) { content = new String(res.getData()); Document doc = Jsoup.parse(content, "UTF-8"); // http-equiv=\"content-type\" content=\"text/html; charset=utf-8\""); Element elem = new Element(Tag.valueOf("meta"), ""); elem.attr("http-equiv", "content-type"); elem.attr("content", "text/html; charset=utf-8"); doc.head().after(elem); System.out.println(doc.head().data()); Element ele = doc.body(); alterElement(ele); Count cTemp = modify(ele, cnt); cnt.setCount(cTemp.getCount()); cnt.setPgCount(cTemp.getPgCount()); doc.body().html(ele.html()); res.setData(doc.html().getBytes()); if (res.getMediaType() == null) res.setMediaType(new MediaType("html", "html")); } EpubWriter wr = new EpubWriter(); wr.write(b, new FileOutputStream(new File(dest))); }
private Element cleanupElement(Element el) { Tag newTag = null; String newText = null; if (el.nodeName().equals("img")) { newTag = Tag.valueOf("x"); newText = el.attr("src"); } if (el.nodeName().equals("em")) { newTag = Tag.valueOf("b"); } if (el.nodeName().equals("a")) { String clazz = el.attr("class"); if (clazz.equals("user")) { newTag = Tag.valueOf("x"); newText = "@" + el.text().trim(); } else if (clazz.startsWith("postimg video")) { newTag = Tag.valueOf("x"); newText = "VIDEO: " + el.attr("href") + " THUMBNAIL: " + el.select("img").attr("src"); } else if (clazz.startsWith("postimg")) { newTag = Tag.valueOf("x"); } else if (clazz.equals("post")) { newTag = Tag.valueOf("x"); } else { newTag = Tag.valueOf("x"); newText = el.attr("href"); } } if (el.nodeName().equals("div")) { newTag = Tag.valueOf("x"); } Element nel; if (newTag == null) { // el = el; nel = new Element(el.tag(), ""); // for(List<Node> children = nel.childNodes(); children.size() > 0; children = // nel.childNodes()) { // children.get(0).remove(); // } } else { nel = new Element(newTag, ""); } if (newText != null) { nel.appendChild(new TextNode(newText, "")); } else { List<Node> children = el.childNodes(); for (Node child : children) { if (child instanceof Element) { nel.appendChild(cleanupElement((Element) child)); } else { nel.appendChild(new TextNode(child.toString(), "")); } } } return nel; }
/**
 * Exercises {@code hasClass} against class attribute values with leading, trailing and
 * embedded whitespace, and against values that merely contain the name as a substring.
 */
@Test
public void testHasClassDomMethods() {
  final Attributes attribs = new Attributes();
  final Element el = new Element(Tag.valueOf("a"), "", attribs);

  // Single class name, with whitespace on either side.
  attribs.put("class", "toto");
  assertTrue(el.hasClass("toto"));

  attribs.put("class", " toto");
  assertTrue(el.hasClass("toto"));

  attribs.put("class", "toto ");
  assertTrue(el.hasClass("toto"));

  attribs.put("class", "\ttoto ");
  assertTrue(el.hasClass("toto"));

  attribs.put("class", " toto ");
  assertTrue(el.hasClass("toto"));

  // Values that must not match: another name, blank, and a longer name containing "toto".
  attribs.put("class", "ab");
  assertFalse(el.hasClass("toto"));

  attribs.put("class", " ");
  assertFalse(el.hasClass("toto"));

  attribs.put("class", "tototo");
  assertFalse(el.hasClass("toto"));

  // A longer name at the start, middle and end of a multi-class value.
  attribs.put("class", "raulpismuth ");
  assertTrue(el.hasClass("raulpismuth"));

  attribs.put("class", " abcd raulpismuth efgh ");
  assertTrue(el.hasClass("raulpismuth"));

  attribs.put("class", " abcd efgh raulpismuth");
  assertTrue(el.hasClass("raulpismuth"));

  attribs.put("class", " abcd efgh raulpismuth ");
  assertTrue(el.hasClass("raulpismuth"));
}
/**
 * Creates a childless copy of {@code sourceEl} that carries only the attributes the whitelist
 * accepts, plus the attributes the whitelist enforces for that tag.
 *
 * @param sourceEl the element to copy
 * @return the new element together with the number of source attributes that were discarded
 */
private ElementMeta createSafeElement(Element sourceEl) {
  final String tagName = sourceEl.tagName();
  final Attributes kept = new Attributes();
  final Element safe = new Element(Tag.valueOf(tagName), sourceEl.baseUri(), kept);

  int dropped = 0;
  for (Attribute candidate : sourceEl.attributes()) {
    if (!whitelist.isSafeAttribute(tagName, sourceEl, candidate)) {
      dropped++;
      continue;
    }
    kept.put(candidate);
  }

  kept.addAll(whitelist.getEnforcedAttributes(tagName));
  return new ElementMeta(safe, dropped);
}
/** * This method is used for extraction of tables with lot of empty cells in it. It is required for * the successful extraction of most Matrix tables. */ private void fillBlankCells() { // We say: cells get a line number. If a column does not contain a cell on a certain line, add a // whitespace. // Any cell that is not filled must be empty: for (Line line : data) { int lineNumber = line.getLineNumber(); COLUMNLOOP: for (Column2 column : dataInColumns) { for (Cell cell : column.getCellObjects()) { if (cell.getLineNumber() == lineNumber) { break; } if (cell.getLineNumber() > line.getLineNumber()) { // the last cell? // Add a blank cell to this column. // System.out.println("Add line to :" + column + " in line: " + // line.getLineNumber()); // <span class='ocrx_word' id='word_9' title="bbox 2175 514 2346 555">were</span> Tag t = Tag.valueOf("span"); Attributes attributes = new Attributes(); attributes.put("class", "ocrx_word"); attributes.put("id", "word_ADDEDBYTEA"); attributes.put( "title", "bbox " + column.getAverageX1() + " " + (int) line.getAverageY1() + " " + column.getAverageX2() + " " + (int) line.getAverageY2()); Element newElement = new Element(t, "localhost:8080", attributes); newElement.text(" "); ArrayList<Element> newCell = new ArrayList<Element>(); newCell.add(newElement); // System.out.println("adding: " +newElement.text()); column.addCell(newCell); break COLUMNLOOP; } } } } }
/**
 * Resolves the "next node" selector part against the current {@code elements}.
 *
 * <p>If exactly one element is selected, its next sibling node is serialised with
 * {@code toString()} and returned as the {@code value} attribute of a synthetic
 * {@code nextnode} element. The result is empty when there is no next sibling. With any other
 * number of selected elements, the current selection is returned unchanged.
 *
 * @param query the selector part; must equal {@code NEXT_NODE_TAG}
 * @return the resulting elements, as described above
 * @throws IllegalArgumentException if {@code query} is not {@code NEXT_NODE_TAG}
 */
private Elements parseNextNode(String query) {
  if (!NEXT_NODE_TAG.equals(query)) {
    throw new IllegalArgumentException("Argument selector part: " + query + " is illegal");
  }
  if (elements.size() != 1) {
    return elements;
  }

  final Elements result = new Elements();
  final Node nextNode = elements.first().nextSibling();
  if (nextNode == null) {
    return result;
  }

  final Attributes attributes = new Attributes();
  attributes.put("value", nextNode.toString());
  result.add(new Element(Tag.valueOf("nextnode"), "", attributes));
  return result;
}
/**
 * Setting an attribute to {@code true} must add a value-less boolean attribute; setting one
 * to {@code false} must remove it, even if it previously held a string value.
 */
@Test
public void testAddBooleanAttribute() {
  final Element div = new Element(Tag.valueOf("div"), "");

  div.attr("true", true);
  // Give "false" a string value first so the boolean false has something to remove.
  div.attr("false", "value");
  div.attr("false", false);

  assertTrue(div.hasAttr("true"));
  assertEquals("", div.attr("true"));

  final List<Attribute> remaining = div.attributes().asList();
  assertEquals("There should be one attribute", 1, remaining.size());
  final Attribute only = remaining.get(0);
  assertTrue("Attribute should be boolean", only instanceof BooleanAttribute);

  assertFalse(div.hasAttr("false"));
  assertEquals("<div true></div>", div.outerHtml());
}
/**
 * Replaces this element's tag with the tag of the given name. For example,
 * {@code el.tagName("div");} turns a {@code <span>} into a {@code <div>}.
 *
 * @param tagName the new tag name; must not be empty
 * @return this element, for chaining
 */
public Element tagName(String tagName) {
  Validate.notEmpty(tagName, "Tag name must not be empty.");
  final Tag replacement = Tag.valueOf(tagName);
  this.tag = replacement;
  return this;
}
@Override protected String doProcess(File htmlfile, String originalUrl, Intent intent) { try { // String charset = "utf-8"; Connection coon = HttpConnection.connect(originalUrl); coon.followRedirects( false); // we don't want it be redirected to other page,example: 10.254.7.4 Document doc = coon.get(); Element head = doc.head(); Element body = doc.body(); if (body.children().size() == 0) { Log.e(TAG, "body has no child with url=" + originalUrl); return PROCESS_FAILED_URL; } /* Elements meta = head.select("meta"); if(!meta.isEmpty()){ Element m = meta.get(0); String content = m.attr("content"); String attr = content.substring(content.indexOf("charset=")+8); if(!attr.trim().isEmpty()){ charset = attr; } } */ Elements base = head.select("base"); if (base.isEmpty()) { String b = head.baseUri(); Attributes attrs = new Attributes(); attrs.put("href", b); ArrayList<Element> a = new ArrayList<>(); a.add(new Element(Tag.valueOf("base"), b, attrs)); head.insertChildren(0, a); } Element div = doc.select("div.content-main").first(); if (div == null) { Log.e(TAG, "not found specific element with url=" + originalUrl); return PROCESS_FAILED_URL; } Element title = div.select("h1.title").first(); title.remove(); body.empty(); ArrayList<Element> a = new ArrayList<>(); a.add(div); body.insertChildren(0, a); int g = 0; while (g < 2) { // try two times. if (FileUtil.saveStringToFile(doc.toString(), htmlfile, false)) { break; } g++; } if (g < 2) return StringUtils.file2Url(htmlfile, PROCESS_FAILED_URL); Log.e(TAG, "save html to file failed with url=" + originalUrl); } catch (MalformedURLException e) { // TODO Auto-generated catch block e.printStackTrace(); } catch (IOException e) { // TODO Auto-generated catch block e.printStackTrace(); } return PROCESS_FAILED_URL; }
/**
 * Creates a new element with the given tag name and this element's base URI, and inserts it
 * as the first child.
 *
 * @param tagName the name of the tag (e.g. {@code div}).
 * @return the new element, so content can be added to it directly, e.g.: {@code
 *     parent.prependElement("h1").attr("id", "header").text("Welcome");}
 */
public Element prependElement(String tagName) {
  final Tag childTag = Tag.valueOf(tagName);
  final Element created = new Element(childTag, baseUri());
  prependChild(created);
  return created;
}
/**
 * Test if this element is a block-level element, as reported by its tag. (E.g. {@code <div> ==
 * true}, whereas an inline element such as {@code <span> == false}.)
 *
 * @return true if this element's tag is a block tag, false if not (and thus inline)
 */
public boolean isBlock() { return tag.isBlock(); }
/**
 * Get the name of the tag for this element, as reported by the element's tag. E.g. {@code div}
 *
 * @return the tag name
 */
public String tagName() { return tag.getName(); }
/**
 * Get the node name of this element, which is the name of its tag.
 *
 * @return the tag name
 */
@Override public String nodeName() { return tag.getName(); }