@Test public void dataset() { Document doc = Jsoup.parse( "<div id=1 data-name=jsoup class=new data-package=jar>Hello</div><p id=2>Hello</p>"); Element div = doc.select("div").first(); Map<String, String> dataset = div.dataset(); Attributes attributes = div.attributes(); // size, get, set, add, remove assertEquals(2, dataset.size()); assertEquals("jsoup", dataset.get("name")); assertEquals("jar", dataset.get("package")); dataset.put("name", "jsoup updated"); dataset.put("language", "java"); dataset.remove("package"); assertEquals(2, dataset.size()); assertEquals(4, attributes.size()); assertEquals("jsoup updated", attributes.get("data-name")); assertEquals("jsoup updated", dataset.get("name")); assertEquals("java", attributes.get("data-language")); assertEquals("java", dataset.get("language")); attributes.put("data-food", "bacon"); assertEquals(3, dataset.size()); assertEquals("bacon", dataset.get("food")); attributes.put("data-", "empty"); assertEquals(null, dataset.get("")); // data- is not a data attribute Element p = doc.select("p").first(); assertEquals(0, p.dataset().size()); }
public static Attributes createAttributes(String[] keys, String[] values) { Attributes attrs = new Attributes(); for (int i = 0; i < keys.length; i += 1) { attrs.put(keys[i], values[i]); } return attrs; }
/** * This method is used for extraction of tables with lot of empty cells in it. It is required for * the successful extraction of most Matrix tables. */ private void fillBlankCells() { // We say: cells get a line number. If a column does not contain a cell on a certain line, add a // whitespace. // Any cell that is not filled must be empty: for (Line line : data) { int lineNumber = line.getLineNumber(); COLUMNLOOP: for (Column2 column : dataInColumns) { for (Cell cell : column.getCellObjects()) { if (cell.getLineNumber() == lineNumber) { break; } if (cell.getLineNumber() > line.getLineNumber()) { // the last cell? // Add a blank cell to this column. // System.out.println("Add line to :" + column + " in line: " + // line.getLineNumber()); // <span class='ocrx_word' id='word_9' title="bbox 2175 514 2346 555">were</span> Tag t = Tag.valueOf("span"); Attributes attributes = new Attributes(); attributes.put("class", "ocrx_word"); attributes.put("id", "word_ADDEDBYTEA"); attributes.put( "title", "bbox " + column.getAverageX1() + " " + (int) line.getAverageY1() + " " + column.getAverageX2() + " " + (int) line.getAverageY2()); Element newElement = new Element(t, "localhost:8080", attributes); newElement.text(" "); ArrayList<Element> newCell = new ArrayList<Element>(); newCell.add(newElement); // System.out.println("adding: " +newElement.text()); column.addCell(newCell); break COLUMNLOOP; } } } } }
Attributes getEnforcedAttributes(String tagName) { Attributes attrs = new Attributes(); TagName tag = TagName.valueOf(tagName); if (enforcedAttributes.containsKey(tag)) { Map<AttributeKey, AttributeValue> keyVals = enforcedAttributes.get(tag); for (Map.Entry<AttributeKey, AttributeValue> entry : keyVals.entrySet()) { attrs.put(entry.getKey().toString(), entry.getValue().toString()); } } return attrs; }
void newAttribute() { if (attributes == null) attributes = new Attributes(); if (pendingAttributeName != null) { Attribute attribute; if (pendingAttributeValue == null) attribute = new Attribute(pendingAttributeName, ""); else attribute = new Attribute(pendingAttributeName, pendingAttributeValue.toString()); attributes.put(attribute); } pendingAttributeName = null; if (pendingAttributeValue != null) pendingAttributeValue.delete(0, pendingAttributeValue.length()); }
private ElementMeta createSafeElement(Element sourceEl) { String sourceTag = sourceEl.tagName(); Attributes destAttrs = new Attributes(); Element dest = new Element(Tag.valueOf(sourceTag), sourceEl.baseUri(), destAttrs); int numDiscarded = 0; Attributes sourceAttrs = sourceEl.attributes(); for (Attribute sourceAttr : sourceAttrs) { if (whitelist.isSafeAttribute(sourceTag, sourceEl, sourceAttr)) destAttrs.put(sourceAttr); else numDiscarded++; } Attributes enforcedAttrs = whitelist.getEnforcedAttributes(sourceTag); destAttrs.addAll(enforcedAttrs); return new ElementMeta(dest, numDiscarded); }
private Elements parseNextNode(String query) { if (!NEXT_NODE_TAG.equals(query)) { throw new IllegalArgumentException("Argument selector part: " + query + " is illegal"); } else { Elements eles = new Elements(); if (elements.size() == 1) { Attributes attributes = new Attributes(); Node nextNode = elements.first().nextSibling(); if (nextNode == null) { return eles; } attributes.put("value", nextNode.toString()); eles.add(new Element(Tag.valueOf("nextnode"), "", attributes)); } else { eles = elements; } return eles; } }
public void put(String key, String value) { put(new Attribute(key, value)); }
/** * Set an attribute (key=value). If the attribute already exists, it is replaced. * * @param attributeKey The attribute key. * @param attributeValue The attribute value. * @return this (for chaining) */ public Node attr(String attributeKey, String attributeValue) { attributes.put(attributeKey, attributeValue); return this; }
@Override protected String doProcess(File htmlfile, String originalUrl, Intent intent) { try { // String charset = "utf-8"; Connection coon = HttpConnection.connect(originalUrl); coon.followRedirects( false); // we don't want it be redirected to other page,example: 10.254.7.4 Document doc = coon.get(); Element head = doc.head(); Element body = doc.body(); if (body.children().size() == 0) { Log.e(TAG, "body has no child with url=" + originalUrl); return PROCESS_FAILED_URL; } /* Elements meta = head.select("meta"); if(!meta.isEmpty()){ Element m = meta.get(0); String content = m.attr("content"); String attr = content.substring(content.indexOf("charset=")+8); if(!attr.trim().isEmpty()){ charset = attr; } } */ Elements base = head.select("base"); if (base.isEmpty()) { String b = head.baseUri(); Attributes attrs = new Attributes(); attrs.put("href", b); ArrayList<Element> a = new ArrayList<>(); a.add(new Element(Tag.valueOf("base"), b, attrs)); head.insertChildren(0, a); } Element div = doc.select("div.content-main").first(); if (div == null) { Log.e(TAG, "not found specific element with url=" + originalUrl); return PROCESS_FAILED_URL; } Element title = div.select("h1.title").first(); title.remove(); body.empty(); ArrayList<Element> a = new ArrayList<>(); a.add(div); body.insertChildren(0, a); int g = 0; while (g < 2) { // try two times. if (FileUtil.saveStringToFile(doc.toString(), htmlfile, false)) { break; } g++; } if (g < 2) return StringUtils.file2Url(htmlfile, PROCESS_FAILED_URL); Log.e(TAG, "save html to file failed with url=" + originalUrl); } catch (MalformedURLException e) { // TODO Auto-generated catch block e.printStackTrace(); } catch (IOException e) { // TODO Auto-generated catch block e.printStackTrace(); } return PROCESS_FAILED_URL; }
@Test public void testHasClassDomMethods() { Tag tag = Tag.valueOf("a"); Attributes attribs = new Attributes(); Element el = new Element(tag, "", attribs); attribs.put("class", "toto"); boolean hasClass = el.hasClass("toto"); assertTrue(hasClass); attribs.put("class", " toto"); hasClass = el.hasClass("toto"); assertTrue(hasClass); attribs.put("class", "toto "); hasClass = el.hasClass("toto"); assertTrue(hasClass); attribs.put("class", "\ttoto "); hasClass = el.hasClass("toto"); assertTrue(hasClass); attribs.put("class", " toto "); hasClass = el.hasClass("toto"); assertTrue(hasClass); attribs.put("class", "ab"); hasClass = el.hasClass("toto"); assertFalse(hasClass); attribs.put("class", " "); hasClass = el.hasClass("toto"); assertFalse(hasClass); attribs.put("class", "tototo"); hasClass = el.hasClass("toto"); assertFalse(hasClass); attribs.put("class", "raulpismuth "); hasClass = el.hasClass("raulpismuth"); assertTrue(hasClass); attribs.put("class", " abcd raulpismuth efgh "); hasClass = el.hasClass("raulpismuth"); assertTrue(hasClass); attribs.put("class", " abcd efgh raulpismuth"); hasClass = el.hasClass("raulpismuth"); assertTrue(hasClass); attribs.put("class", " abcd efgh raulpismuth "); hasClass = el.hasClass("raulpismuth"); assertTrue(hasClass); }