Ejemplo n.º 1
0
  /**
   * Scores how strongly a link looks like a pagination link of the requested kind.
   *
   * <p>Both candidate kinds, {@code "next"} and {@code "prov"} (presumably "previous" - confirm
   * against the patterns registered in {@code pageTypePattern}), are tested against the link.
   * A match for the kind equal to {@code type} raises the score, a match for the other kind
   * lowers it by the same amount:
   *
   * <ul>
   *   <li>{@code href} matches the {@code "<kind>Link"} pattern: 25 points;
   *   <li>the link text matches the {@code "<kind>Text"} pattern: 50 points;
   *   <li>the link's parent has exactly one child element and the parent's text matches the
   *       {@code "<kind>Text"} pattern: 50 points.
   * </ul>
   *
   * <p>A link whose text content is {@code null} is additionally penalised by 50 points per
   * candidate kind, because that check runs inside the loop.
   *
   * @param href the link target that is tested against the {@code "<kind>Link"} patterns
   * @param link the link node whose text and parent are inspected
   * @param type the kind of page link being looked for; compared against {@code "next"} and
   *     {@code "prov"}
   * @return the accumulated score; may be negative
   */
  private int getNextOrProvPageScore(String href, Node link, String type) {
    final String[] candidateKinds = {"next", "prov"};
    int total = 0;

    for (int i = 0; i < candidateKinds.length; i++) {
      final String kind = candidateKinds[i];
      // Evidence for the requested kind counts in favour, evidence for the other kind against.
      final int direction = kind.equals(type) ? 1 : -1;

      if (ReadAbilityUtil.match(pageTypePattern.get(kind + "Link"), href)) {
        total += direction * 25;
      }

      // Evaluated on every iteration, so a text-less link loses 50 points per candidate kind.
      if (link.getTextContent() == null) {
        total -= 50;
      }
      if (ReadAbilityUtil.match(pageTypePattern.get(kind + "Text"), link.getTextContent())) {
        total += direction * 50;
      }

      // Only consult the parent's text when the parent has a single child element.
      final Node container = link.getParentNode();
      if (container != null
          && container.getChildElementsCount() == 1
          && ReadAbilityUtil.match(
              pageTypePattern.get(kind + "Text"), container.getTextContent())) {
        total += direction * 50;
      }
    }
    return total;
  }
  /**
   * Extracts string values from the markup produced by {@code node.toString()}.
   *
   * <p>Three modes are selected by {@code expression}:
   *
   * <ul>
   *   <li>{@code "*"}: the node's own string form is returned as the single result, without
   *       parsing it;
   *   <li>any expression containing {@code "href"}: every {@code <a>} element is selected and
   *       its {@code href} attribute value is returned (anchors without an {@code href}
   *       attribute are skipped);
   *   <li>anything else: {@code expression} is handed to {@code NodeSelector.select} and the
   *       text content of each selected node is returned; nodes whose text is {@code null} or
   *       empty after normalisation are skipped.
   * </ul>
   *
   * <p>Every returned value has runs of two or more whitespace characters collapsed to a single
   * space, is trimmed, and is HTML-unescaped exactly once.
   *
   * <p>NOTE(review): the previous implementation also called
   * {@code replaceAll(expression, null)} on each href. With a {@code null} replacement that call
   * could only be a no-op or throw {@link NullPointerException} when the expression happened to
   * match the href as a regular expression, so it has been removed. If a real substitution was
   * intended, it has to be reintroduced with an actual replacement string.
   *
   * @param node the object whose {@code toString()} yields the markup to inspect
   * @param expression {@code "*"}, an expression containing {@code "href"}, or a selector
   *     understood by {@code NodeSelector}
   * @return the extracted values in selection order; never {@code null}, possibly empty
   */
  @Override
  public List<String> extractValueFromNode(Object node, String expression) {
    List<String> list = new ArrayList<>();

    // "*" asks for the node itself, so the markup does not need to be parsed at all.
    if (expression.equals("*")) {
      list.add(node.toString());
      return list;
    }

    final String whitespaceRun = "[\\t\\n\\r\\s]{2,}";
    final boolean wantsHref = expression.contains("href");

    Jerry doc = Jerry.jerry(node.toString());
    NodeSelector nodeSelector = new NodeSelector(doc.get(0));
    List<Node> selectedNodes = nodeSelector.select(wantsHref ? "a" : expression);

    for (Node snode : selectedNodes) {
      if (wantsHref) {
        // An <a> element is not guaranteed to carry an href attribute.
        Object href = snode.getAttribute("href");
        if (href == null) {
          continue;
        }
        list.add(
            StringEscapeUtils.unescapeHtml(href.toString().trim().replaceAll(whitespaceRun, " ")));
      } else {
        String text = snode.getTextContent();
        if (text == null) {
          continue;
        }
        // Unescape once only: a second pass would turn literal text such as "&amp;lt;" into "<".
        String value = StringEscapeUtils.unescapeHtml(text.replaceAll(whitespaceRun, " ").trim());
        if (value != null && !value.isEmpty()) {
          list.add(value);
        }
      }
    }
    return list;
  }