/**
 * Builds a map of candidate page links found among the given anchor nodes.
 *
 * <p>For every node the {@code href} attribute is read; empty values are skipped. The fragment
 * part ({@code #...}) and a single trailing slash are stripped, and the result is resolved
 * against {@code currentUrl} via {@code Util.getAbsouluteUrl}. Links equal to the current URL,
 * or that do not contain {@code "://" + host} of the current URL, are discarded. Each remaining
 * link is scored with {@code getPageUrlScore} and stored under {@code UrlItem.getKey()}; a later
 * link with the same key replaces an earlier one.
 *
 * @param allLink nodes to inspect for an {@code href} attribute
 * @param type page type forwarded unchanged to {@code getPageUrlScore}
 * @return map from URL key to the scored URL item; empty when nothing qualifies
 * @throws MalformedURLException propagated from the URL helpers used during resolution
 */
public Map<String, ScoredElement<UrlItem>> getPossiblePageUrls(Node[] allLink, String type)
        throws MalformedURLException {
    final Map<String, ScoredElement<UrlItem>> candidates =
            new HashMap<String, ScoredElement<UrlItem>>();

    for (int i = 0; i < allLink.length; i++) {
        final Node anchor = allLink[i];
        final String rawHref = anchor.getAttribute("href");
        if (StringUtil.isEmpty(rawHref)) {
            continue;
        }

        // Normalise: drop the fragment, then one trailing slash, then make the link absolute.
        final String withoutFragment = rawHref.replaceAll("#.*$", "");
        final String withoutTrailingSlash = withoutFragment.replaceAll("/$", "");
        final String absoluteHref = Util.getAbsouluteUrl(withoutTrailingSlash, currentUrl);

        // A link back to the page being processed is never a candidate.
        if (currentUrl.getUrl().equals(absoluteHref)) {
            continue;
        }

        // Keep only links whose text contains "://<host of the current page>".
        final String hostMarker = "://" + currentUrl.getUri().getHost();
        if (!absoluteHref.contains(hostMarker)) {
            continue;
        }

        final int linkScore = getPageUrlScore(absoluteHref, anchor, type);
        final UrlItem item = new UrlItem(absoluteHref);
        candidates.put(item.getKey(), new ScoredElement<UrlItem>(linkScore, item));
    }

    return candidates;
}
@Override public List<String> extractValueFromNode(Object node, String expression) { Jerry doc = Jerry.jerry(node.toString()); List<String> list = new ArrayList(); String replacement = null; if (expression.equals("*")) { list.add(node.toString()); return list; } /*if (expression.equals("#")) { list.add(Integer.toString(enumerator++)); return list; }*/ Node doc2 = doc.get(0); NodeSelector nodeSelector = new NodeSelector(doc2); List<Node> selectedNodes; // = nodeSelector.select(expression); if (expression.contains("href")) selectedNodes = nodeSelector.select("a"); else selectedNodes = nodeSelector.select(expression); for (Node snode : selectedNodes) { if (expression.contains("href")) { list.add( StringEscapeUtils.unescapeHtml( snode .getAttribute("href") .toString() .replaceAll(expression, replacement) .trim() .replaceAll("[\\t\\n\\r\\s]{2,}", " "))); } else { String value = StringEscapeUtils.unescapeHtml( snode.getTextContent().replaceAll("[\\t\\n\\r\\s]{2,}", " ").trim()); if (value != null & !value.equals("")) list.add(StringEscapeUtils.unescapeHtml(value)); } } /*for (Node snode : selectedNodes) { if (snode.getInnerHtml().toString() != null && !snode.getInnerHtml().toString().trim().replaceAll("[\\t\\n\\r\\s]{2,}", " ").equals("")) { list.add(snode.getInnerHtml().toString().trim().replaceAll("[\\t\\n\\r\\s]{2,}", " ")); } }*/ return list; }
private int getNextOrProvPageScore(String href, Node link, String type) { int score = 0; String[] types = new String[] {"next", "prov"}; for (String pageType : types) { if (ReadAbilityUtil.match(pageTypePattern.get(pageType + "Link"), href)) { if (pageType.equals(type)) score += 25; else score -= 25; // System.out.println(link.getTextContent()+"[1] "+score); } if (link.getTextContent() == null) { score -= 50; } if (ReadAbilityUtil.match(pageTypePattern.get(pageType + "Text"), link.getTextContent())) { if (pageType.equals(type)) score += 50; else score -= 50; // System.out.println(link.getTextContent()+"[2] "+score); } Node parent = link.getParentNode(); if (parent != null && parent.getChildElementsCount() == 1) { if (ReadAbilityUtil.match( pageTypePattern.get(pageType + "Text"), parent.getTextContent())) { if (pageType.equals(type)) score += 50; else score -= 50; } } } // System.out.println(link.getTextContent()+" "+score); return score; }