@Override public List<String> extractValueFromNode(Object node, String expression) { Jerry doc = Jerry.jerry(node.toString()); List<String> list = new ArrayList(); String replacement = null; if (expression.equals("*")) { list.add(node.toString()); return list; } /*if (expression.equals("#")) { list.add(Integer.toString(enumerator++)); return list; }*/ Node doc2 = doc.get(0); NodeSelector nodeSelector = new NodeSelector(doc2); List<Node> selectedNodes; // = nodeSelector.select(expression); if (expression.contains("href")) selectedNodes = nodeSelector.select("a"); else selectedNodes = nodeSelector.select(expression); for (Node snode : selectedNodes) { if (expression.contains("href")) { list.add( StringEscapeUtils.unescapeHtml( snode .getAttribute("href") .toString() .replaceAll(expression, replacement) .trim() .replaceAll("[\\t\\n\\r\\s]{2,}", " "))); } else { String value = StringEscapeUtils.unescapeHtml( snode.getTextContent().replaceAll("[\\t\\n\\r\\s]{2,}", " ").trim()); if (value != null & !value.equals("")) list.add(StringEscapeUtils.unescapeHtml(value)); } } /*for (Node snode : selectedNodes) { if (snode.getInnerHtml().toString() != null && !snode.getInnerHtml().toString().trim().replaceAll("[\\t\\n\\r\\s]{2,}", " ").equals("")) { list.add(snode.getInnerHtml().toString().trim().replaceAll("[\\t\\n\\r\\s]{2,}", " ")); } }*/ return list; }
/**
 * Looks for a page URL among the links of the document.
 *
 * <p>All {@code a} elements of {@code document} are collected and handed, together with
 * {@code type}, to {@code getPossiblePageUrls}. The scored candidates it returns are then
 * reduced to a single item through {@code ScoredElement.getTopElement}.
 *
 * @return the item chosen by {@code getTopElement}, or {@code null} when the document
 *     contains no {@code a} elements
 * @throws MalformedURLException declared by this method; presumably raised while candidate
 *     URLs are built (confirm against {@code getPossiblePageUrls})
 */
public UrlItem parse() throws MalformedURLException {
    final UrlItem noResult = null;

    final Node[] anchors = document.$("a").get();
    final boolean hasAnchors = anchors != null && anchors.length != 0;
    if (!hasAnchors) {
        return noResult;
    }

    final Map<String, ScoredElement<UrlItem>> candidates = getPossiblePageUrls(anchors, type);
    final ScoredElement<UrlItem> seed = new ScoredElement<UrlItem>(0, noResult);
    return seed.getTopElement(candidates.values(), 0);
}