/**
   * Extracts string values from the HTML produced by {@code node.toString()}.
   *
   * <p>Three forms of {@code expression} are handled:
   *
   * <ul>
   *   <li>{@code "*"} - the whole {@code node.toString()} is returned as the only element;
   *   <li>any expression containing {@code "href"} - every {@code a} element is selected and the
   *       value of its {@code href} attribute is returned (anchors without that attribute are
   *       skipped);
   *   <li>anything else - the expression is passed to {@code NodeSelector.select} as the selector
   *       query and the text content of every match is returned, blank values being dropped.
   * </ul>
   *
   * <p>Returned values are trimmed, have runs of two or more whitespace characters collapsed into
   * a single space, and are HTML-unescaped once.
   *
   * @param node object whose {@code toString()} yields the markup to inspect
   * @param expression {@code "*"}, an expression containing {@code "href"}, or a selector query
   * @return the extracted values in selection order; never {@code null}, possibly empty
   */
  @Override
  public List<String> extractValueFromNode(Object node, String expression) {
    List<String> values = new ArrayList<String>();

    if (expression.equals("*")) {
      values.add(node.toString());
      return values;
    }

    // Parse only once we know the wildcard shortcut does not apply.
    Jerry doc = Jerry.jerry(node.toString());
    NodeSelector nodeSelector = new NodeSelector(doc.get(0));

    boolean wantsHref = expression.contains("href");
    List<Node> selectedNodes = nodeSelector.select(wantsHref ? "a" : expression);

    for (Node selected : selectedNodes) {
      if (wantsHref) {
        String href = selected.getAttribute("href");
        if (href == null) {
          // An anchor without an href attribute has nothing to contribute.
          continue;
        }
        // The former replaceAll(expression, null) step is gone on purpose: with a null
        // replacement it either changed nothing or threw a NullPointerException as soon as the
        // selector text happened to match the link when interpreted as a regex.
        values.add(
            StringEscapeUtils.unescapeHtml(href.trim().replaceAll("[\\t\\n\\r\\s]{2,}", " ")));
      } else {
        // Unescape exactly once; a second pass would corrupt double-encoded text
        // (e.g. "&amp;lt;" must stay "&lt;").
        String text =
            StringEscapeUtils.unescapeHtml(
                selected.getTextContent().replaceAll("[\\t\\n\\r\\s]{2,}", " ").trim());
        if (text != null && !text.equals("")) {
          values.add(text);
        }
      }
    }
    return values;
  }
  // Example no. 2
  // 0
  /**
   * Finds the page URL among the links of {@code document}.
   *
   * <p>All {@code a} elements are collected and handed, together with {@code type}, to
   * {@code getPossiblePageUrls}; the scored candidates it returns are then reduced to a single
   * item by {@code ScoredElement#getTopElement}.
   *
   * @return the item chosen from the candidates, or {@code null} when the document contains no
   *     {@code a} elements
   * @throws MalformedURLException declared for the helper calls; presumably raised while the
   *     candidate URLs are built - confirm against {@code getPossiblePageUrls}
   */
  public UrlItem parse() throws MalformedURLException {
    Node[] anchors = document.$("a").get();
    boolean hasAnchors = anchors != null && anchors.length > 0;
    if (!hasAnchors) {
      return null;
    }

    Map<String, ScoredElement<UrlItem>> candidates = getPossiblePageUrls(anchors, type);

    // Typed null so the ScoredElement constructor call resolves exactly as before.
    UrlItem emptyItem = null;
    ScoredElement<UrlItem> seed = new ScoredElement<UrlItem>(0, emptyItem);
    UrlItem best = seed.getTopElement(candidates.values(), 0);
    return best;
  }