예제 #1
0
파일: ParseUrl.java 프로젝트: ewinli/crawer
  /**
   * Builds a map of candidate page links found among the given nodes.
   *
   * <p>For each node the {@code href} attribute is read, any fragment ({@code #...}) and a
   * trailing slash are stripped, and the result is resolved against {@code currentUrl} via
   * {@code Util.getAbsouluteUrl}. A link is dropped when its href is empty, when the resolved URL
   * equals the current URL, or when the resolved URL does not contain {@code "://"} followed by
   * the current URL's host. Surviving links are scored with {@code getPageUrlScore} and stored
   * under {@code UrlItem.getKey()}; a later link with the same key replaces an earlier one.
   *
   * @param allLink nodes whose {@code href} attributes are inspected
   * @param type passed through unchanged to {@code getPageUrlScore}
   * @return map from URL key to the scored URL item; empty when no link qualifies
   * @throws MalformedURLException declared by the signature; presumably raised while resolving
   *     or wrapping a URL (confirm against {@code Util} and {@code UrlItem})
   */
  public Map<String, ScoredElement<UrlItem>> getPossiblePageUrls(Node[] allLink, String type)
      throws MalformedURLException {
    Map<String, ScoredElement<UrlItem>> candidates =
        new HashMap<String, ScoredElement<UrlItem>>();

    for (int i = 0; i < allLink.length; i++) {
      Node anchor = allLink[i];

      String rawHref = anchor.getAttribute("href");
      if (!StringUtil.isEmpty(rawHref)) {
        // Normalise: drop the fragment first, then a trailing slash, then make it absolute.
        String withoutFragment = rawHref.replaceAll("#.*$", "");
        String withoutSlash = withoutFragment.replaceAll("/$", "");
        String absolute = Util.getAbsouluteUrl(withoutSlash, currentUrl);

        // Ignore links that point back at the page being processed.
        if (!currentUrl.getUrl().equals(absolute)) {
          String hostMarker = "://" + currentUrl.getUri().getHost();

          // Keep only links whose URL contains the current host marker.
          if (absolute.indexOf(hostMarker) != -1) {
            int linkScore = getPageUrlScore(absolute, anchor, type);
            UrlItem item = new UrlItem(absolute);
            candidates.put(item.getKey(), new ScoredElement<UrlItem>(linkScore, item));
          }
        }
      }
    }

    return candidates;
  }
  /**
   * Extracts string values from the HTML rendering of {@code node}, driven by {@code expression}.
   *
   * <ul>
   *   <li>{@code "*"}: returns a single-element list containing {@code node.toString()}.
   *   <li>An expression containing {@code "href"}: every {@code a} element is selected and its
   *       {@code href} attribute is returned, trimmed, with runs of two or more whitespace
   *       characters collapsed to one space, and HTML-unescaped. Anchors without an
   *       {@code href} attribute are skipped.
   *   <li>Anything else: {@code expression} is used as the selector and the text content of each
   *       match is returned, whitespace-collapsed, trimmed and HTML-unescaped; empty values are
   *       omitted.
   * </ul>
   *
   * @param node object whose {@code toString()} yields the HTML to parse; must not be null
   * @param expression {@code "*"}, a selector containing {@code "href"}, or any other selector
   * @return extracted values in selection order; never null
   */
  @Override
  public List<String> extractValueFromNode(Object node, String expression) {
    List<String> list = new ArrayList<String>();
    String html = node.toString();

    // "*" means "the node itself": no parsing is needed for this case.
    if (expression.equals("*")) {
      list.add(html);
      return list;
    }

    final String whitespaceRun = "[\\t\\n\\r\\s]{2,}";
    boolean wantsHref = expression.contains("href");

    Jerry doc = Jerry.jerry(html);
    NodeSelector nodeSelector = new NodeSelector(doc.get(0));
    List<Node> selectedNodes = nodeSelector.select(wantsHref ? "a" : expression);

    for (Node snode : selectedNodes) {
      if (wantsHref) {
        String href = snode.getAttribute("href");
        if (href == null) {
          // Anchor without an href attribute: nothing to extract.
          continue;
        }
        // The former replaceAll(expression, null) was removed: with a null replacement it
        // changed nothing when the pattern did not match and threw NullPointerException when
        // it did.
        list.add(StringEscapeUtils.unescapeHtml(href.trim().replaceAll(whitespaceRun, " ")));
      } else {
        String value =
            StringEscapeUtils.unescapeHtml(
                snode.getTextContent().replaceAll(whitespaceRun, " ").trim());
        // Short-circuit && so the null check actually protects the equals() call.
        if (value != null && !value.equals("")) {
          // NOTE(review): the value is unescaped a second time here, as before; confirm that
          // double unescaping is intended.
          list.add(StringEscapeUtils.unescapeHtml(value));
        }
      }
    }
    return list;
  }