Beispiel #1
0
 /**
  * Builds an {@code Attributes} instance from two parallel arrays.
  *
  * <p>The entry at position {@code i} of {@code keys} is paired with the entry at position
  * {@code i} of {@code values}; {@code values} is indexed once per key.
  *
  * @param keys the attribute keys
  * @param values the attribute values, indexed in step with {@code keys}
  * @return a new {@code Attributes} holding one entry per key
  */
 public static Attributes createAttributes(String[] keys, String[] values) {
   Attributes result = new Attributes();
   int index = 0;
   for (String key : keys) {
     result.put(key, values[index]);
     index++;
   }
   return result;
 }
Beispiel #2
0
  /**
   * Look up the value of an attribute by key.
   *
   * <p>If no attribute is stored under the exact key and the key starts with <code><b>abs:</b>
   * </code> (compared case-insensitively), the remainder of the key is passed to
   * {@link #absUrl} so that a possibly relative URL is returned in absolute form. E.g.:
   *
   * <blockquote>
   *
   * <code>String url = a.attr("abs:href");</code>
   *
   * </blockquote>
   *
   * @param attributeKey The attribute key; must not be null.
   * @return The attribute value, or an empty string when the attribute is absent (never null from
   *     the absent branch).
   * @see #attributes()
   * @see #hasAttr(String)
   * @see #absUrl(String)
   */
  public String attr(String attributeKey) {
    Validate.notNull(attributeKey);

    // An attribute stored under the literal key always wins over the "abs:" shortcut.
    if (attributes.hasKey(attributeKey)) {
      return attributes.get(attributeKey);
    }

    final String absPrefix = "abs:";
    boolean wantsAbsolute = attributeKey.toLowerCase().startsWith(absPrefix);
    return wantsAbsolute ? absUrl(attributeKey.substring(absPrefix.length())) : "";
  }
Beispiel #3
0
 /**
  * Copies every attribute of {@code incoming} into this collection.
  *
  * <p>Nothing happens when {@code incoming} reports a size of zero. The backing map is created on
  * first use, pre-sized to the number of incoming attributes.
  *
  * @param incoming the attributes to add
  */
 public void addAll(Attributes incoming) {
   if (incoming.size() == 0) {
     return;
   }
   if (this.attributes == null) {
     // Parameterized instead of a raw LinkedHashMap so the compiler checks key/value types.
     this.attributes = new LinkedHashMap<String, Attribute>(incoming.size());
   }
   this.attributes.putAll(incoming.attributes);
 }
  /**
   * Converts an attribute list into a key/value map, leaving out every key contained in
   * {@code SKIP_ATTR}.
   *
   * @param attributes A list of attribs
   * @return Returns a mutable, insertion-ordered map parsed out of the attribute list
   */
  public static Map<String, String> parseAttribs(Attributes attributes) {
    final int initialCapacity = attributes.size() + 4;
    Map<String, String> parsed = new LinkedHashMap<String, String>(initialCapacity);

    for (Attribute attribute : attributes.asList()) {
      String key = attribute.getKey();
      if (SKIP_ATTR.contains(key)) {
        continue;
      }
      parsed.put(key, attribute.getValue());
    }

    return parsed;
  }
Beispiel #5
0
  /**
   * Test if this element has an attribute.
   *
   * <p>A key that starts with {@code abs:} (compared case-insensitively) also matches when the
   * attribute named by the remainder of the key exists and {@code absUrl} yields a non-empty
   * result for it.
   *
   * @param attributeKey The attribute key to check; must not be null.
   * @return true if the attribute exists, false if not.
   */
  public boolean hasAttr(String attributeKey) {
    Validate.notNull(attributeKey);

    final String absPrefix = "abs:";
    if (attributeKey.toLowerCase().startsWith(absPrefix)) {
      String bareKey = attributeKey.substring(absPrefix.length());
      boolean resolvable = attributes.hasKey(bareKey) && !absUrl(bareKey).equals("");
      if (resolvable) {
        return true;
      }
    }
    // Fall back to a lookup under the literal key (including any "abs:" prefix).
    return attributes.hasKey(attributeKey);
  }
Beispiel #6
0
 /**
  * Collects the attributes registered in {@code enforcedAttributes} for the given tag.
  *
  * @param tagName name of the tag to look up
  * @return a new {@code Attributes} holding the string form of each registered key and value;
  *     empty when nothing is registered for the tag
  */
 Attributes getEnforcedAttributes(String tagName) {
   Attributes enforced = new Attributes();
   TagName tag = TagName.valueOf(tagName);
   if (!enforcedAttributes.containsKey(tag)) {
     return enforced;
   }
   for (Map.Entry<AttributeKey, AttributeValue> pair : enforcedAttributes.get(tag).entrySet()) {
     String key = pair.getKey().toString();
     String value = pair.getValue().toString();
     enforced.put(key, value);
   }
   return enforced;
 }
 /**
  * Check if an element is visible based on whether it has an aria presentation tag.
  *
  * @param element the element whose attributes are inspected
  * @return true if the element is visible rather than just presentation.
  * @todo(dallison) check other aria roles for visible intentions
  */
 static boolean isVisible(Element element) {
   Attributes attrs = element.attributes();
   // NOTE(review): presence is tested with the literal "role" while the value is read through
   // ARIA_ROLE; presumably both name the same attribute -- confirm against the constant.
   if (!attrs.hasKey("role")) {
     return true;
   }
   return !attrs.get(ARIA_ROLE).equals(ARIA_PRESENTATION);
 }
  /**
   * Rewrites every {@code <img>} in the given HTML so that its {@code src} takes the value of its
   * {@code data-original} attribute.
   *
   * <p>Images that do not carry a {@code data-original} attribute are left untouched, so an
   * existing {@code src} is no longer replaced by an empty string.
   *
   * @param pcont HTML content to revise; may be null
   * @return the revised document as HTML, or an empty string when {@code pcont} is null
   */
  public String reviseImgForIxiqi(String pcont) {
    if (pcont == null) {
      return "";
    }

    Document doc = Jsoup.parse(pcont);
    for (Element img : doc.select("img")) {
      Attributes attrs = img.attributes();
      // Only promote the attribute when it is actually present on this image.
      if (attrs.hasKey("data-original")) {
        img.attr("src", attrs.get("data-original"));
      }
    }
    return doc.html();
  }
  /**
   * Prefixes the {@code src} of every {@code <img>} in the given HTML with
   * {@code http://qdaily.com/}.
   *
   * @param pcont HTML content to revise; may be null
   * @return the revised document as HTML, or an empty string when {@code pcont} is null
   */
  public String reviseImgForQdaily(String pcont) {
    if (pcont == null) {
      return "";
    }

    final Document parsed = Jsoup.parse(pcont);
    for (Element image : parsed.select("img")) {
      String currentSrc = image.attributes().get("src");
      image.attr("src", "http://qdaily.com/" + currentSrc);
    }
    return parsed.html();
  }
Beispiel #10
0
  /**
   * Inserts an {@code <img>} for every {@code <input name="hiddenimg">} found in the given HTML.
   *
   * <p>The new image takes the input's {@code value} as its {@code src} and is placed before the
   * input's parent element. The image is built through the DOM API rather than by concatenating
   * markup, so quotes or ampersands in the value cannot corrupt the generated element.
   *
   * @param pcont HTML content to revise; may be null
   * @return the revised document as HTML, or an empty string when {@code pcont} is null
   */
  public String reviseImgForYuehui(String pcont) {
    if (pcont == null) {
      return "";
    }

    Document doc = Jsoup.parse(pcont);
    // Iterating an empty selection is a no-op, so no explicit size check is needed.
    for (Element hidden : doc.select("input[name=\"hiddenimg\"]")) {
      String source = hidden.attributes().get("value");
      Element img = doc.createElement("img").attr("src", source);
      hidden.parent().before(img);
    }
    return doc.html();
  }
Beispiel #11
0
  /**
   * Copies the {@code data-src} attribute of each {@code <img>} in the given HTML into its
   * {@code src}; images without {@code data-src} are left as they are.
   *
   * @param pcont HTML content to revise; may be null
   * @return the revised document as HTML, or an empty string when {@code pcont} is null
   */
  public String reviseImgForSohuNews(String pcont) {
    if (pcont == null) {
      return "";
    }

    final Document parsed = Jsoup.parse(pcont);
    for (Element image : parsed.select("img")) {
      Attributes imageAttrs = image.attributes();
      if (!imageAttrs.hasKey("data-src")) {
        continue;
      }
      image.attr("src", imageAttrs.get("data-src"));
    }
    return parsed.html();
  }
Beispiel #12
0
 /**
  * Combines the hash of the parent node with the hash of the attributes; either contributes 0
  * when it is null.
  */
 @Override
 public int hashCode() {
   int parentHash = (parentNode == null) ? 0 : parentNode.hashCode();
   // Children are left out on purpose: per the original note, hashing them would go back up to
   // the parent and blow the stack.
   int attributesHash = (attributes == null) ? 0 : attributes.hashCode();
   return 31 * parentHash + attributesHash;
 }
Beispiel #13
0
  /**
   * Creates a new element with the same tag and base URI as {@code sourceEl}, carrying only the
   * attributes the whitelist accepts plus the attributes the whitelist enforces for that tag.
   *
   * @param sourceEl the element to copy from
   * @return the new element together with the number of source attributes that were discarded
   */
  private ElementMeta createSafeElement(Element sourceEl) {
    final String tagName = sourceEl.tagName();
    final Attributes keptAttrs = new Attributes();
    final Element safeEl = new Element(Tag.valueOf(tagName), sourceEl.baseUri(), keptAttrs);

    int discarded = 0;
    for (Attribute candidate : sourceEl.attributes()) {
      if (!whitelist.isSafeAttribute(tagName, sourceEl, candidate)) {
        discarded++;
        continue;
      }
      keptAttrs.put(candidate);
    }
    // Enforced attributes are added last, after the filtered source attributes.
    keptAttrs.addAll(whitelist.getEnforcedAttributes(tagName));

    return new ElementMeta(safeEl, discarded);
  }
Beispiel #14
0
 /**
  * Creates a copy of this attribute collection in which every contained attribute is cloned as
  * well.
  *
  * <p>When no backing map has been created yet, a fresh empty {@code Attributes} is returned.
  *
  * @return the copy
  * @throws RuntimeException wrapping {@link CloneNotSupportedException} if cloning is not
  *     supported
  */
 @Override
 public Attributes clone() {
   if (this.attributes == null) {
     return new Attributes();
   }
   try {
     Attributes copy = (Attributes) super.clone();
     // Typed map and iterator replace the raw types, so no per-element cast is required.
     copy.attributes = new LinkedHashMap<String, Attribute>(this.attributes.size());
     for (Iterator<Attribute> it = iterator(); it.hasNext(); ) {
       Attribute attribute = it.next();
       copy.attributes.put(attribute.getKey(), attribute.clone());
     }
     return copy;
   } catch (CloneNotSupportedException e) {
     throw new RuntimeException(e);
   }
 }
Beispiel #15
0
  /**
   * This method is used for extraction of tables with lot of empty cells in it. It is required for
   * the successful extraction of most Matrix tables.
   *
   * <p>For every line in {@code data}, the columns in {@code dataInColumns} are scanned. When a
   * column's cells reach a line number greater than the current line before any cell on the
   * current line is seen, a synthetic {@code <span class='ocrx_word'>} element containing a single
   * space is added to that column as a new one-element cell.
   */
  private void fillBlankCells() {
    // Cells carry a line number. If a column has no cell on a given line, a whitespace cell is
    // added for it, since any cell that is not filled must be empty.
    for (Line line : data) {
      int lineNumber = line.getLineNumber();
      COLUMNLOOP:
      for (Column2 column : dataInColumns) {
        for (Cell cell : column.getCellObjects()) {
          if (cell.getLineNumber() == lineNumber) {
            // This column already has a cell on the current line; stop scanning this column.
            break;
          }
          if (cell.getLineNumber() > line.getLineNumber()) { // the last cell?
            // Add a blank cell to this column. The synthetic element mimics an OCR word such as:
            // <span class='ocrx_word' id='word_9' title="bbox 2175 514 2346 555">were</span>

            Tag t = Tag.valueOf("span");
            Attributes attributes = new Attributes();
            attributes.put("class", "ocrx_word");
            attributes.put("id", "word_ADDEDBYTEA");
            // Bounding box: x-range from the column averages, y-range from the line averages
            // (the y values are cast to int).
            attributes.put(
                "title",
                "bbox "
                    + column.getAverageX1()
                    + " "
                    + (int) line.getAverageY1()
                    + " "
                    + column.getAverageX2()
                    + " "
                    + (int) line.getAverageY2());

            Element newElement = new Element(t, "localhost:8080", attributes);
            newElement.text(" ");
            ArrayList<Element> newCell = new ArrayList<Element>();
            newCell.add(newElement);
            column.addCell(newCell);
            // NOTE(review): this leaves the loop over ALL columns for the current line, so the
            // remaining columns are not examined for this line. If the intent was to move on to
            // the next column, `continue COLUMNLOOP` would be needed -- confirm. Leaving the cell
            // loop right after addCell presumably also avoids iterating a modified collection.
            break COLUMNLOOP;
          }
        }
      }
    }
  }
Beispiel #16
0
 /**
  * Stores {@code value} under the data key derived from {@code key} in the enclosing
  * {@code Attributes}.
  *
  * @param key the key, converted with {@code Attributes.dataKey}
  * @param value the value to store
  * @return the value previously stored under the derived key, or null if there was none
  */
 public String put(String key, String value) {
   final String dataKey = Attributes.dataKey(key);
   String previous = null;
   if (Attributes.this.hasKey(dataKey)) {
     previous = ((Attribute) Attributes.this.attributes.get(dataKey)).getValue();
   }
   Attributes.this.attributes.put(dataKey, new Attribute(dataKey, value));
   return previous;
 }
Beispiel #17
0
 /**
  * Handles the next-node selector part.
  *
  * <p>When exactly one element is currently selected, its next sibling node is wrapped in a
  * synthetic {@code nextnode} element whose {@code value} attribute holds the sibling's string
  * form; with no next sibling the result is empty. For any other selection size the current
  * {@code elements} are returned unchanged.
  *
  * @param query the selector part; must equal {@code NEXT_NODE_TAG}
  * @return the resulting elements
  * @throws IllegalArgumentException if {@code query} is not {@code NEXT_NODE_TAG}
  */
 private Elements parseNextNode(String query) {
   if (!NEXT_NODE_TAG.equals(query)) {
     throw new IllegalArgumentException("Argument selector part: " + query + " is illegal");
   }
   if (elements.size() != 1) {
     return elements;
   }

   Elements result = new Elements();
   Node sibling = elements.first().nextSibling();
   if (sibling != null) {
     Attributes attrs = new Attributes();
     attrs.put("value", sibling.toString());
     result.add(new Element(Tag.valueOf("nextnode"), "", attrs));
   }
   return result;
 }
Beispiel #18
0
  /** Verifies that the dataset map and the element's attributes stay in sync in both directions. */
  @Test
  public void dataset() {
    String html =
        "<div id=1 data-name=jsoup class=new data-package=jar>Hello</div><p id=2>Hello</p>";
    Document doc = Jsoup.parse(html);
    Element divEl = doc.select("div").first();
    Map<String, String> data = divEl.dataset();
    Attributes attrs = divEl.attributes();

    // Initial view: only the two data-* attributes are exposed, without their prefix.
    assertEquals(2, data.size());
    assertEquals("jsoup", data.get("name"));
    assertEquals("jar", data.get("package"));

    // Update, add and remove through the dataset view.
    data.put("name", "jsoup updated");
    data.put("language", "java");
    data.remove("package");

    assertEquals(2, data.size());
    assertEquals(4, attrs.size());
    assertEquals("jsoup updated", attrs.get("data-name"));
    assertEquals("jsoup updated", data.get("name"));
    assertEquals("java", attrs.get("data-language"));
    assertEquals("java", data.get("language"));

    // Changes made on the attributes show up in the dataset.
    attrs.put("data-food", "bacon");
    assertEquals(3, data.size());
    assertEquals("bacon", data.get("food"));

    attrs.put("data-", "empty");
    assertEquals(null, data.get("")); // data- is not a data attribute

    Element paragraph = doc.select("p").first();
    assertEquals(0, paragraph.dataset().size());
  }
Beispiel #19
0
    /**
     * Commits the pending attribute name/value pair, if a name is pending, and resets the pending
     * state. A pending name without a value is stored with an empty string as its value.
     */
    void newAttribute() {
      if (attributes == null) {
        attributes = new Attributes();
      }

      if (pendingAttributeName != null) {
        String value = (pendingAttributeValue == null) ? "" : pendingAttributeValue.toString();
        attributes.put(new Attribute(pendingAttributeName, value));
      }

      pendingAttributeName = null;
      if (pendingAttributeValue != null) {
        // The buffer is emptied in place rather than replaced.
        pendingAttributeValue.delete(0, pendingAttributeValue.length());
      }
    }
Beispiel #20
0
  /**
   * Revises the images of the given HTML in two passes.
   *
   * <ol>
   *   <li>For each {@code <img>} inside a {@code <noscript>}, a new {@code <img>} with the same
   *       {@code src} is inserted before that image's parent, and the {@code <noscript>} is then
   *       removed.
   *   <li>For each remaining {@code <img>}, {@code src} is replaced by {@code data-original} when
   *       that is non-empty, and then by {@code data-actualsrc} when that is non-empty (so
   *       {@code data-actualsrc} wins when both are set).
   * </ol>
   *
   * <p>The inserted images are built through the DOM API rather than by concatenating markup, so
   * quotes or ampersands in the source URL cannot corrupt the generated element.
   *
   * @param pcont HTML content to revise; may be null
   * @return the revised document as HTML, or an empty string when {@code pcont} is null
   */
  public String reviseImgForZhiHuApp(String pcont) {
    if (pcont == null) {
      return "";
    }

    Document doc = Jsoup.parse(pcont);
    for (Element noscript : doc.select("noscript")) {
      for (Element img : noscript.getElementsByTag("img")) {
        String source = img.attributes().get("src");
        Element promoted = doc.createElement("img").attr("src", source);
        img.parent().before(promoted);
      }
      noscript.remove();
    }

    for (Element img : doc.select("img")) {
      String original = img.attr("data-original");
      String actual = img.attr("data-actualsrc");
      if (!original.equals("")) {
        img.attr("src", original);
      }
      if (!actual.equals("")) {
        img.attr("src", actual);
      }
    }
    return doc.html();
  }
Beispiel #21
0
  /**
   * Clones this node and, recursively, its child nodes, attaching the copy to the given parent.
   *
   * @param parent the parent to set on the copy; can be null, to create an orphan split
   * @return the cloned node
   * @throws RuntimeException wrapping {@link CloneNotSupportedException} if cloning is not
   *     supported
   */
  protected Node doClone(Node parent) {
    final Node copy;
    try {
      copy = (Node) super.clone();
    } catch (CloneNotSupportedException e) {
      throw new RuntimeException(e);
    }

    copy.parentNode = parent;
    copy.siblingIndex = (parent != null) ? siblingIndex : 0;
    copy.attributes = (attributes == null) ? null : attributes.clone();
    copy.baseUri = baseUri;

    // clone() creates orphans, doClone() keeps parent -- each child copy is parented to `copy`.
    copy.childNodes = new ArrayList<Node>(childNodes.size());
    for (Node child : childNodes) {
      Node childCopy = child.doClone(copy);
      copy.childNodes.add(childCopy);
    }

    return copy;
  }
Beispiel #22
0
  /**
   * Downloads the page at {@code originalUrl}, reduces its body to the {@code div.content-main}
   * element (without its {@code h1.title}, if any) and saves the result to {@code htmlfile}.
   *
   * <p>A {@code <base>} element pointing at the head's base URI is inserted when the page has
   * none. Saving is attempted at most twice.
   *
   * @param htmlfile the file the processed HTML is written to
   * @param originalUrl the URL to fetch
   * @param intent not used by this implementation
   * @return the URL form of {@code htmlfile} on success, otherwise {@code PROCESS_FAILED_URL}
   */
  @Override
  protected String doProcess(File htmlfile, String originalUrl, Intent intent) {
    try {
      Connection coon = HttpConnection.connect(originalUrl);
      // we don't want it be redirected to other page,example: 10.254.7.4
      coon.followRedirects(false);
      Document doc = coon.get();
      Element head = doc.head();
      Element body = doc.body();
      if (body.children().size() == 0) {
        Log.e(TAG, "body has no child with url=" + originalUrl);
        return PROCESS_FAILED_URL;
      }

      Elements base = head.select("base");
      if (base.isEmpty()) {
        String b = head.baseUri();
        Attributes attrs = new Attributes();
        attrs.put("href", b);
        ArrayList<Element> baseElements = new ArrayList<>();
        baseElements.add(new Element(Tag.valueOf("base"), b, attrs));
        head.insertChildren(0, baseElements);
      }

      Element div = doc.select("div.content-main").first();
      if (div == null) {
        Log.e(TAG, "not found specific element with url=" + originalUrl);
        return PROCESS_FAILED_URL;
      }
      // The title is optional: first() returns null when no h1.title exists, so guard the removal.
      Element title = div.select("h1.title").first();
      if (title != null) {
        title.remove();
      }
      body.empty();
      ArrayList<Element> content = new ArrayList<>();
      content.add(div);
      body.insertChildren(0, content);

      // try two times.
      String html = doc.toString();
      for (int attempt = 0; attempt < 2; attempt++) {
        if (FileUtil.saveStringToFile(html, htmlfile, false)) {
          return StringUtils.file2Url(htmlfile, PROCESS_FAILED_URL);
        }
      }
      Log.e(TAG, "save html to file failed with url=" + originalUrl);
    } catch (MalformedURLException e) {
      Log.e(TAG, "malformed url=" + originalUrl, e);
    } catch (IOException e) {
      Log.e(TAG, "failed to fetch or process url=" + originalUrl, e);
    }
    return PROCESS_FAILED_URL;
  }
Beispiel #23
0
  /**
   * Checks {@code hasClass} against class attribute values with varying surrounding whitespace,
   * near-miss names and multi-class lists.
   */
  @Test
  public void testHasClassDomMethods() {
    Tag tag = Tag.valueOf("a");
    Attributes attribs = new Attributes();
    Element el = new Element(tag, "", attribs);

    // Values in which "toto" is present as a whole class name.
    String[] withToto = {"toto", " toto", "toto ", "\ttoto ", "  toto "};
    for (String classValue : withToto) {
      attribs.put("class", classValue);
      assertTrue(el.hasClass("toto"));
    }

    // Values in which "toto" is absent or only a substring of another class name.
    String[] withoutToto = {"ab", "     ", "tototo"};
    for (String classValue : withoutToto) {
      attribs.put("class", classValue);
      assertFalse(el.hasClass("toto"));
    }

    // A longer class name at the start, middle and end of a list.
    String[] withRaulpismuth = {
      "raulpismuth  ",
      " abcd  raulpismuth efgh ",
      " abcd efgh raulpismuth",
      " abcd efgh raulpismuth "
    };
    for (String classValue : withRaulpismuth) {
      attribs.put("class", classValue);
      assertTrue(el.hasClass("raulpismuth"));
    }
  }
Beispiel #24
0
 /**
  * Removes the attribute stored under the given key from this node's attributes.
  *
  * @param attributeKey The key of the attribute to remove; must not be null.
  * @return this node, so that calls can be chained
  */
 public Node removeAttr(String attributeKey) {
   Validate.notNull(attributeKey);
   attributes.remove(attributeKey);
   return this;
 }
Beispiel #25
0
 /**
  * Stores an attribute (key=value) on this node; an existing attribute with the same key is
  * replaced.
  *
  * @param attributeKey The attribute key.
  * @param attributeValue The attribute value.
  * @return this node, so that calls can be chained
  */
 public Node attr(String attributeKey, String attributeValue) {
   attributes.put(attributeKey, attributeValue);
   return this;
 }