Пример #1
0
  /**
   * Builds a map of candidate page links found among the given link nodes.
   *
   * <p>For every node the {@code href} attribute is read, stripped of its fragment
   * ({@code #...}) and of one trailing slash, and resolved through
   * {@code Util.getAbsouluteUrl} against {@code currentUrl}. A link is dropped when its
   * {@code href} is empty, when the resolved URL equals the current URL, or when the
   * resolved URL does not contain {@code "://" + <host of currentUrl>}.
   *
   * <p>Each remaining link is scored with {@code getPageUrlScore} and stored under
   * {@code UrlItem.getKey()}; a later link with the same key replaces an earlier one.
   *
   * @param allLink link nodes to inspect
   * @param type page type forwarded unchanged to {@code getPageUrlScore}
   * @return map from URL key to the scored URL item; empty when nothing qualifies
   * @throws MalformedURLException declared by this method; propagated from the URL helpers it calls
   */
  public Map<String, ScoredElement<UrlItem>> getPossiblePageUrls(Node[] allLink, String type)
      throws MalformedURLException {
    Map<String, ScoredElement<UrlItem>> candidates =
        new HashMap<String, ScoredElement<UrlItem>>();

    for (Node anchor : allLink) {
      String rawHref = anchor.getAttribute("href");
      if (StringUtil.isEmpty(rawHref)) {
        continue;
      }

      // Drop the fragment and a single trailing slash before resolving to an absolute URL.
      String target =
          Util.getAbsouluteUrl(rawHref.replaceAll("#.*$", "").replaceAll("/$", ""), currentUrl);

      // Skip self-links first; only then is the host of the current URL looked up.
      if (currentUrl.getUrl().equals(target)
          || !target.contains("://" + currentUrl.getUri().getHost())) {
        continue;
      }

      int linkScore = getPageUrlScore(target, anchor, type);
      UrlItem item = new UrlItem(target);
      candidates.put(item.getKey(), new ScoredElement<UrlItem>(linkScore, item));
    }
    return candidates;
  }
  /**
   * Extracts string values from the HTML produced by {@code node.toString()}.
   *
   * <p>The expression is interpreted as follows:
   *
   * <ul>
   *   <li>{@code "*"}: the result is a single element, {@code node.toString()} itself; the
   *       HTML is not parsed.
   *   <li>an expression containing {@code "href"}: all {@code a} elements are selected and the
   *       {@code href} attribute of each one is returned; anchors without that attribute are
   *       skipped.
   *   <li>anything else: the expression is passed to {@code NodeSelector.select} and the
   *       non-empty text content of each selected node is returned.
   * </ul>
   *
   * <p>Returned values are HTML-unescaped, and every run of two or more whitespace characters
   * is collapsed to a single space.
   *
   * @param node object whose {@code toString()} yields the HTML to inspect
   * @param expression {@code "*"}, an expression containing {@code "href"}, or a selector
   * @return extracted values in selection order; never {@code null}
   */
  @Override
  public List<String> extractValueFromNode(Object node, String expression) {
    List<String> values = new ArrayList<String>();

    // "*" means "the whole node": answer before paying for a parse that would be discarded.
    if (expression.equals("*")) {
      values.add(node.toString());
      return values;
    }

    Jerry doc = Jerry.jerry(node.toString());
    NodeSelector nodeSelector = new NodeSelector(doc.get(0));

    boolean wantsHref = expression.contains("href");
    List<Node> selectedNodes = nodeSelector.select(wantsHref ? "a" : expression);

    for (Node selected : selectedNodes) {
      if (wantsHref) {
        String href = selected.getAttribute("href");
        if (href == null) {
          // Anchor without an href attribute: nothing to extract.
          continue;
        }
        // The selector expression is deliberately NOT applied to the href as a regex:
        // the previous replaceAll(expression, null) could only leave the value unchanged
        // or throw (NullPointerException on a match, PatternSyntaxException on selectors
        // that are not valid regular expressions).
        values.add(
            StringEscapeUtils.unescapeHtml(href.trim().replaceAll("[\\t\\n\\r\\s]{2,}", " ")));
      } else {
        String rawText = selected.getTextContent();
        if (rawText == null) {
          continue;
        }
        String value =
            StringEscapeUtils.unescapeHtml(
                rawText.replaceAll("[\\t\\n\\r\\s]{2,}", " ").trim());
        if (value != null && !value.equals("")) {
          // NOTE(review): the value is unescaped a second time here, exactly as before;
          // confirm whether double unescaping is intended.
          values.add(StringEscapeUtils.unescapeHtml(value));
        }
      }
    }
    return values;
  }
Пример #3
0
  /**
   * Scores how well a link fits the requested page direction.
   *
   * <p>Both directions, {@code "next"} and {@code "prov"}, are evaluated in turn. For each
   * direction, a match counts in favour of the link when the direction equals {@code type}
   * and against it otherwise:
   *
   * <ul>
   *   <li>{@code href} matching the {@code <direction>Link} pattern: 25 points;
   *   <li>the link text matching the {@code <direction>Text} pattern: 50 points;
   *   <li>the parent's text matching the {@code <direction>Text} pattern, checked only when
   *       the parent exists and has exactly one child element: 50 points.
   * </ul>
   *
   * <p>A link whose text content is {@code null} loses 50 points on each of the two passes.
   *
   * @param href URL of the link
   * @param link link node being scored
   * @param type requested direction, compared against {@code "next"} and {@code "prov"}
   * @return accumulated score; may be negative
   */
  private int getNextOrProvPageScore(String href, Node link, String type) {
    int total = 0;

    for (String direction : new String[] {"next", "prov"}) {
      // +1 when this is the direction the caller asked for, -1 for the opposite one.
      int sign = direction.equals(type) ? 1 : -1;

      if (ReadAbilityUtil.match(pageTypePattern.get(direction + "Link"), href)) {
        total += sign * 25;
      }

      if (link.getTextContent() == null) {
        total -= 50;
      }
      if (ReadAbilityUtil.match(pageTypePattern.get(direction + "Text"), link.getTextContent())) {
        total += sign * 50;
      }

      // The parent's text is only considered when the link is its sole child element.
      Node container = link.getParentNode();
      if (container != null
          && container.getChildElementsCount() == 1
          && ReadAbilityUtil.match(
              pageTypePattern.get(direction + "Text"), container.getTextContent())) {
        total += sign * 50;
      }
    }
    return total;
  }