Beispiel #1
0
  /**
   * Exercises the map returned by {@code Element.dataset()} together with the element's {@code
   * Attributes}: the test expects changes made through either one to be visible through the other.
   */
  @Test
  public void dataset() {
    String html =
        "<div id=1 data-name=jsoup class=new data-package=jar>Hello</div><p id=2>Hello</p>";
    Document parsed = Jsoup.parse(html);
    Element divEl = parsed.select("div").first();
    Map<String, String> dataView = divEl.dataset();
    Attributes rawAttrs = divEl.attributes();

    // Initial state: only the two data-* attributes are expected in the dataset.
    assertEquals(2, dataView.size());
    assertEquals("jsoup", dataView.get("name"));
    assertEquals("jar", dataView.get("package"));

    // Update one entry, add one entry, remove one entry through the dataset.
    dataView.put("name", "jsoup updated");
    dataView.put("language", "java");
    dataView.remove("package");

    // The dataset edits are expected to show up in the underlying attributes as well.
    assertEquals(2, dataView.size());
    assertEquals(4, rawAttrs.size());
    assertEquals("jsoup updated", rawAttrs.get("data-name"));
    assertEquals("jsoup updated", dataView.get("name"));
    assertEquals("java", rawAttrs.get("data-language"));
    assertEquals("java", dataView.get("language"));

    // Going the other way: a data-* attribute added directly is expected in the dataset.
    rawAttrs.put("data-food", "bacon");
    assertEquals(3, dataView.size());
    assertEquals("bacon", dataView.get("food"));

    // A bare "data-" key is not expected to count as a data attribute.
    rawAttrs.put("data-", "empty");
    assertEquals(null, dataView.get(""));

    // An element without data-* attributes is expected to have an empty dataset.
    Element paragraph = parsed.select("p").first();
    assertEquals(0, paragraph.dataset().size());
  }
Beispiel #2
0
 /**
  * Builds an {@code Attributes} instance by pairing each key with the value at the same index.
  *
  * @param keys attribute keys; one attribute is added per element
  * @param values attribute values, read at the same index as the corresponding key
  * @return a new {@code Attributes} holding the key/value pairs, added in array order
  */
 public static Attributes createAttributes(String[] keys, String[] values) {
   Attributes result = new Attributes();
   int index = 0;
   for (String key : keys) {
     result.put(key, values[index]);
     index++;
   }
   return result;
 }
Beispiel #3
0
  /**
   * Inserts placeholder cells (a single space) into columns that have no cell on a given line.
   * Used when extracting tables that contain many empty cells; the original author notes it is
   * required for the successful extraction of most matrix tables.
   *
   * <p>For every line in {@code data}, the columns in {@code dataInColumns} are scanned. Within a
   * column, the cells are walked in iteration order until either a cell with the same line number
   * is found (nothing to do for that column) or a cell with a greater line number is found, in
   * which case a synthetic {@code span} element is added to the column as a new cell. If every
   * cell in the column has a smaller line number, nothing is added for that column.
   *
   * <p>NOTE(review): the scan presumably relies on {@code column.getCellObjects()} being ordered
   * by line number - TODO confirm.
   *
   * <p>NOTE(review): after adding a blank cell, {@code break COLUMNLOOP} leaves the whole column
   * loop, so the remaining columns are not examined for the current line. Confirm this is intended
   * (as opposed to moving on to the next column).
   */
  private void fillBlankCells() {
    // Each cell carries a line number. If a column has no cell on a certain line, a whitespace
    // cell is added for it.
    for (Line line : data) {
      int lineNumber = line.getLineNumber();
      COLUMNLOOP:
      for (Column2 column : dataInColumns) {
        for (Cell cell : column.getCellObjects()) {
          if (cell.getLineNumber() == lineNumber) {
            // This column already has a cell on the current line; go on to the next column.
            break;
          }
          if (cell.getLineNumber() > line.getLineNumber()) { // the last cell?
            // A cell from a later line was reached before any cell on this line: add a blank
            // cell to this column.
            // (Disabled debug output: "Add line to :" + column + " in line: " +
            // line.getLineNumber())

            // The synthetic element mimics an OCR word span such as:
            // <span class='ocrx_word' id='word_9' title="bbox 2175 514 2346 555">were</span>

            Tag t = Tag.valueOf("span");
            Attributes attributes = new Attributes();
            attributes.put("class", "ocrx_word");
            attributes.put("id", "word_ADDEDBYTEA");
            // Bounding box: x-range from the column's averages, y-range from the line's averages
            // (the y values are truncated to int).
            attributes.put(
                "title",
                "bbox "
                    + column.getAverageX1()
                    + " "
                    + (int) line.getAverageY1()
                    + " "
                    + column.getAverageX2()
                    + " "
                    + (int) line.getAverageY2());

            // "localhost:8080" is passed as the element's base URI.
            Element newElement = new Element(t, "localhost:8080", attributes);
            newElement.text(" ");
            ArrayList<Element> newCell = new ArrayList<Element>();
            newCell.add(newElement);
            // (Disabled debug output: "adding: " + newElement.text())
            column.addCell(newCell);
            // Stops iterating the cells right after the column was modified, and also ends the
            // column loop for this line (see the review note in the method Javadoc).
            break COLUMNLOOP;
          }
        }
      }
    }
  }
Beispiel #4
0
 /**
  * Collects the attributes registered in {@code enforcedAttributes} for the given tag.
  *
  * @param tagName name of the tag to look up
  * @return a new {@code Attributes} with one entry per enforced key/value pair; empty when nothing
  *     is registered for the tag
  */
 Attributes getEnforcedAttributes(String tagName) {
   Attributes enforced = new Attributes();
   TagName lookupKey = TagName.valueOf(tagName);
   if (!enforcedAttributes.containsKey(lookupKey)) {
     return enforced;
   }

   Map<AttributeKey, AttributeValue> registered = enforcedAttributes.get(lookupKey);
   for (Map.Entry<AttributeKey, AttributeValue> pair : registered.entrySet()) {
     String name = pair.getKey().toString();
     String value = pair.getValue().toString();
     enforced.put(name, value);
   }
   return enforced;
 }
Beispiel #5
0
    /**
     * Commits the pending attribute (if a name is pending) to {@code attributes}, creating the
     * attribute collection on first use, and then resets the pending name and value.
     */
    void newAttribute() {
      if (attributes == null) {
        attributes = new Attributes();
      }

      if (pendingAttributeName != null) {
        // A missing value is stored as the empty string.
        String value = pendingAttributeValue == null ? "" : pendingAttributeValue.toString();
        attributes.put(new Attribute(pendingAttributeName, value));
      }

      // Reset the pending state; the value buffer is emptied in place rather than replaced.
      pendingAttributeName = null;
      if (pendingAttributeValue != null) {
        pendingAttributeValue.delete(0, pendingAttributeValue.length());
      }
    }
Beispiel #6
0
  /**
   * Creates a copy of the element's tag that carries only the attributes accepted by {@code
   * whitelist}, plus the attributes the whitelist enforces for that tag.
   *
   * @param sourceEl element to copy the tag, base URI and accepted attributes from
   * @return the new element together with the number of source attributes that were rejected
   */
  private ElementMeta createSafeElement(Element sourceEl) {
    String tagName = sourceEl.tagName();
    Attributes keptAttrs = new Attributes();
    Element safeEl = new Element(Tag.valueOf(tagName), sourceEl.baseUri(), keptAttrs);

    int discardedCount = 0;
    for (Attribute candidate : sourceEl.attributes()) {
      if (!whitelist.isSafeAttribute(tagName, sourceEl, candidate)) {
        discardedCount++;
        continue;
      }
      keptAttrs.put(candidate);
    }

    // Enforced attributes are added after the accepted ones.
    keptAttrs.addAll(whitelist.getEnforcedAttributes(tagName));

    return new ElementMeta(safeEl, discardedCount);
  }
Beispiel #7
0
 /**
  * Handles the {@code NEXT_NODE_TAG} selector part.
  *
  * <p>When {@code elements} holds exactly one element, its next sibling node is wrapped in a
  * synthetic {@code nextnode} element whose {@code value} attribute is the sibling's string form;
  * without a next sibling the result is empty. For any other size, {@code elements} itself is
  * returned.
  *
  * @param query selector part; must equal {@code NEXT_NODE_TAG}
  * @return the resulting elements as described above
  * @throws IllegalArgumentException if {@code query} is not {@code NEXT_NODE_TAG}
  */
 private Elements parseNextNode(String query) {
   if (!NEXT_NODE_TAG.equals(query)) {
     throw new IllegalArgumentException("Argument selector part: " + query + " is illegal");
   }
   if (elements.size() != 1) {
     return elements;
   }

   Elements result = new Elements();
   Node sibling = elements.first().nextSibling();
   if (sibling != null) {
     Attributes siblingAttrs = new Attributes();
     siblingAttrs.put("value", sibling.toString());
     result.add(new Element(Tag.valueOf("nextnode"), "", siblingAttrs));
   }
   return result;
 }
Beispiel #8
0
 /**
  * Wraps the key and value in an {@code Attribute} and stores it via {@code put(Attribute)}.
  *
  * @param key attribute key
  * @param value attribute value
  */
 public void put(String key, String value) {
   Attribute entry = new Attribute(key, value);
   put(entry);
 }
Beispiel #9
0
 /**
  * Set an attribute (key=value) on this node. If the attribute already exists, it is replaced.
  *
  * <p>The call is forwarded to this node's {@code attributes} collection.
  *
  * @param attributeKey The attribute key.
  * @param attributeValue The attribute value.
  * @return this node, so that calls can be chained
  */
 public Node attr(String attributeKey, String attributeValue) {
   attributes.put(attributeKey, attributeValue);
   return this;
 }
Beispiel #10
0
  /**
   * Downloads the page at {@code originalUrl}, reduces its body to the {@code div.content-main}
   * element (without its {@code h1.title} heading, when there is one) and saves the resulting
   * document to {@code htmlfile}.
   *
   * <p>Redirects are not followed. If the document head has no {@code <base>} element, one pointing
   * at the head's base URI is inserted as the first child of the head. Saving is attempted at most
   * twice.
   *
   * @param htmlfile file the processed HTML is written to
   * @param originalUrl URL of the page to fetch
   * @param intent not used by this implementation
   * @return the result of {@code StringUtils.file2Url(htmlfile, PROCESS_FAILED_URL)} when the file
   *     was saved; {@code PROCESS_FAILED_URL} when the body is empty, the content element is
   *     missing, saving fails twice, or an {@link IOException} occurs
   */
  @Override
  protected String doProcess(File htmlfile, String originalUrl, Intent intent) {
    try {
      Connection connection = HttpConnection.connect(originalUrl);
      // We don't want to be redirected to another page, for example: 10.254.7.4
      connection.followRedirects(false);
      Document doc = connection.get();
      Element head = doc.head();
      Element body = doc.body();
      if (body.children().size() == 0) {
        Log.e(TAG, "body has no child with url=" + originalUrl);
        return PROCESS_FAILED_URL;
      }

      // Make sure the document carries a <base> element.
      if (head.select("base").isEmpty()) {
        String baseUri = head.baseUri();
        Attributes baseAttrs = new Attributes();
        baseAttrs.put("href", baseUri);
        ArrayList<Element> baseElements = new ArrayList<>();
        baseElements.add(new Element(Tag.valueOf("base"), baseUri, baseAttrs));
        head.insertChildren(0, baseElements);
      }

      Element content = doc.select("div.content-main").first();
      if (content == null) {
        Log.e(TAG, "not found specific element with url=" + originalUrl);
        return PROCESS_FAILED_URL;
      }

      // first() yields null when nothing matches, so only drop the heading if the page has one.
      Element title = content.select("h1.title").first();
      if (title != null) {
        title.remove();
      }

      // Replace the whole body with just the content element.
      body.empty();
      ArrayList<Element> contentElements = new ArrayList<>();
      contentElements.add(content);
      body.insertChildren(0, contentElements);

      // Try to save two times before giving up.
      String html = doc.toString();
      for (int attempt = 0; attempt < 2; attempt++) {
        if (FileUtil.saveStringToFile(html, htmlfile, false)) {
          return StringUtils.file2Url(htmlfile, PROCESS_FAILED_URL);
        }
      }
      Log.e(TAG, "save html to file failed with url=" + originalUrl);
    } catch (IOException e) {
      // Also covers MalformedURLException, which is a subclass of IOException.
      Log.e(TAG, "failed to fetch or process url=" + originalUrl, e);
    }
    return PROCESS_FAILED_URL;
  }
Beispiel #11
0
  /**
   * Checks {@code Element.hasClass} against a series of raw {@code class} attribute values with
   * varying leading, trailing and inner whitespace, including values that must not match.
   */
  @Test
  public void testHasClassDomMethods() {
    Attributes attribs = new Attributes();
    Element el = new Element(Tag.valueOf("a"), "", attribs);

    // Values in which "toto" is expected to be found as a class name.
    String[] containsToto = {"toto", " toto", "toto ", "\ttoto ", "  toto "};
    for (String classValue : containsToto) {
      attribs.put("class", classValue);
      assertTrue(el.hasClass("toto"));
    }

    // Values in which "toto" is expected NOT to be found (other name, blank, longer name).
    String[] lacksToto = {"ab", "     ", "tototo"};
    for (String classValue : lacksToto) {
      attribs.put("class", classValue);
      assertFalse(el.hasClass("toto"));
    }

    // Values in which "raulpismuth" is expected to be found, alone or among other names.
    String[] containsRaulpismuth = {
      "raulpismuth  ",
      " abcd  raulpismuth efgh ",
      " abcd efgh raulpismuth",
      " abcd efgh raulpismuth "
    };
    for (String classValue : containsRaulpismuth) {
      attribs.put("class", classValue);
      assertTrue(el.hasClass("raulpismuth"));
    }
  }