Пример #1
0
  /**
   * Downloads one page and prints every expected number that is missing from it.
   *
   * <p>The page at {@code url} is parsed and narrowed in three steps: first to the nodes accepted
   * by {@code HasAttributeFilter("bgcolor", "#DDE1FF")}, then to those accepted by {@code
   * HasAttributeFilter("tr")}, and finally to those accepted by {@code
   * HasAttributeFilter("span")}. Starting at position 5, every fifth node of the final list is
   * read as plain text and compared with a running counter {@code starNo * 100 + index}; every
   * counter value that has to be skipped before the text matches is printed to standard output.
   *
   * <p>Parser and I/O failures are reported with {@code printStackTrace()} and are not rethrown.
   * The HTTP connection is always disconnected before the method returns.
   *
   * @param url address of the page to download and scan
   * @param starNo prefix of the expected numbers; the expected sequence starts at {@code starNo *
   *     100}
   */
  public static void dealOnePage(String url, int starNo) {
    // Held outside the try block so that it can be released in finally on every path.
    HttpURLConnection connection = null;
    try {
      connection = (HttpURLConnection) (new URL(url)).openConnection();
      Parser parser = new Parser(connection);
      NodeList tableSet =
          parser.extractAllNodesThatMatch(new HasAttributeFilter("bgcolor", "#DDE1FF"));

      // Each narrowing step re-parses the HTML produced by the previous one.
      parser = new Parser(new Lexer(tableSet.toHtml()));
      NodeList tdSet = parser.extractAllNodesThatMatch(new HasAttributeFilter("tr"));
      parser = new Parser(new Lexer(tdSet.toHtml()));

      PrototypicalNodeFactory p = new PrototypicalNodeFactory();
      p.registerTag(new SpanTag());
      parser.setNodeFactory(p);

      NodeList spanSet = parser.extractAllNodesThatMatch(new HasAttributeFilter("span"));
      int index = 0;
      // Only every fifth node, beginning with the sixth, is compared against the counter.
      for (int i = 5; i < spanSet.size(); i = i + 5) {
        String str = spanSet.elementAt(i).toPlainTextString();
        String now = "" + (starNo * 100 + index);
        index++;
        // Print each expected value that the page skipped until the counter catches up.
        // NOTE(review): this loop only ends when the text equals a later counter value; text that
        // never matches (non-numeric, or lower than the counter) keeps it printing — confirm the
        // page format guarantees ascending numbers.
        while (str.compareTo(now) != 0) {
          System.out.println(now);
          now = "" + (starNo * 100 + index);
          index++;
        }
      }
    } catch (ParserException e) {
      e.printStackTrace();
    } catch (MalformedURLException e) {
      e.printStackTrace();
    } catch (IOException e) {
      e.printStackTrace();
    } finally {
      // Release the connection; previously it was left open after parsing or on failure.
      if (connection != null) {
        connection.disconnect();
      }
    }
  }
Пример #2
0
  /**
   * Parses {@code html} and collects the tags of one exact class, optionally filtered by an
   * attribute.
   *
   * <p>A node is accepted when its runtime class is exactly {@code tagType} and one of the
   * following holds:
   *
   * <ul>
   *   <li>{@code attr} is {@code null} (no attribute constraint);
   *   <li>{@code value} is {@code null} and the node has a non-null value for {@code attr};
   *   <li>{@code value} is non-null and equals the node's value for {@code attr}.
   * </ul>
   *
   * @param <T> the tag type to collect
   * @param html the HTML text to parse
   * @param tagType the class object of the tag type; compared by identity with each node's class
   * @param attr name of the attribute the tag must carry, or {@code null} to accept every tag of
   *     the requested type
   * @param value required attribute value, or {@code null} to accept any value as long as the
   *     attribute is present
   * @param test when {@code true}, logs each visited node's class and the inspected attribute
   *     values through {@code logger.info}
   * @return the accepted tags in the order the parser returned them, or {@code null} if parsing
   *     threw a {@code ParserException}
   */
  public static <T extends TagNode> List<T> parseTags(
      String html,
      final Class<T> tagType,
      final String attr,
      final String value,
      final boolean test) {
    NodeFilter matcher =
        new NodeFilter() {
          @Override
          public boolean accept(Node node) {
            if (test) {
              logger.info(node.getClass());
            }
            if (node.getClass() != tagType) {
              return false;
            }
            if (attr == null) {
              return true;
            }
            TagNode tag = (TagNode) node;
            String actual = tag.getAttribute(attr);
            // A null expected value means "attribute present, any value".
            if (actual != null && value == null) {
              return true;
            }
            if (test) {
              logger.info(actual);
            }
            return value != null && actual != null && actual.equals(value);
          }
        };

    Parser parser = new Parser();
    try {
      PrototypicalNodeFactory nodeFactory = new PrototypicalNodeFactory();
      nodeFactory.registerTag(new PreTag());
      parser.setNodeFactory(nodeFactory);
      parser.setInputHTML(html);
      NodeList matched = parser.parse(matcher);

      // Copy the library's NodeList into a typed java.util.List.
      List<T> result = new ArrayList<T>();
      int position = 0;
      while (position < matched.size()) {
        result.add((T) matched.elementAt(position));
        position++;
      }
      return result;
    } catch (ParserException e) {
      e.printStackTrace();
      return null;
    }
  }
Пример #3
0
  /**
   * Parses {@code html} with a filter that accepts every node.
   *
   * @param html the HTML text to parse
   * @return the node list produced by {@code Parser.parse}, or {@code null} if parsing threw a
   *     {@code ParserException}
   */
  public static NodeList parseAllTags(String html) {
    NodeFilter acceptEverything =
        new NodeFilter() {
          @Override
          public boolean accept(Node node) {
            return true;
          }
        };

    Parser parser = new Parser();
    try {
      PrototypicalNodeFactory nodeFactory = new PrototypicalNodeFactory();
      nodeFactory.registerTag(new PreTag());
      parser.setNodeFactory(nodeFactory);
      parser.setInputHTML(html);
      return parser.parse(acceptEverything);
    } catch (ParserException e) {
      e.printStackTrace();
      return null;
    }
  }
Пример #4
0
  /**
   * Generates preview content from an HTML fragment.
   *
   * <p>If {@code html} is no longer than {@code max_count * 1.1} characters it is returned
   * unchanged. Otherwise the HTML is parsed with the class-level {@code factory} and {@code
   * nfilter} (both defined outside this method) and the resulting nodes are re-emitted in order:
   * tags as {@code <text>}, text nodes either verbatim (shorter than 10 characters) or abbreviated
   * to the remaining budget. Text nodes are skipped once 10 or fewer characters of budget remain.
   *
   * <p>Once the output reaches {@code max_count} characters, emission stops: a trailing start tag
   * that was just written is removed again, and the end tags of all enclosing tags are appended so
   * that the preview does not leave elements open.
   *
   * @param html the HTML to shorten; must not be {@code null}
   * @param max_count approximate maximum length of the preview, in characters; for a value of zero
   *     or less the preview is the empty string (unless {@code html} itself is empty)
   * @return the preview, or the original {@code html} if parsing threw a {@code ParserException}
   */
  public static String preview(String html, int max_count) {
    if (html.length() <= max_count * 1.1) {
      return html;
    }
    Parser parser = new Parser();
    // Local, single-threaded accumulation, so the unsynchronized builder suffices.
    StringBuilder prvContent = new StringBuilder();
    try {
      parser.setEncoding(Globals.ENC_8859_1);
      parser.setInputHTML(html);

      parser.setNodeFactory(factory);

      NodeList nodes = parser.extractAllNodesThatMatch(nfilter);
      // The most recently emitted node; null until the first node has been processed.
      Node node = null;
      for (int i = 0; i < nodes.size(); i++) {
        if (prvContent.length() >= max_count) {
          // With max_count <= 0 the limit is hit before any node was emitted; there is then
          // nothing to trim or close (previously this path threw a NullPointerException).
          if (node != null) {
            if (node instanceof TagNode) {
              TagNode lastTag = (TagNode) node;
              if (!lastTag.isEndTag()) {
                // Drop the start tag just written: its text plus the surrounding '<' and '>'.
                prvContent.setLength(prvContent.length() - lastTag.getText().length() - 2);
              }
            }
            // Close every tag that is still open by walking up the ancestor chain.
            for (Node parent = node.getParent(); parent != null; parent = parent.getParent()) {
              if (!(parent instanceof TagNode)) {
                continue;
              }
              Node endTag = ((TagNode) parent).getEndTag();
              // Skip ancestors without a recorded end tag instead of dereferencing null.
              if (endTag != null) {
                prvContent.append(endTag.toHtml());
              }
            }
          }
          break;
        }
        node = nodes.elementAt(i);
        if (node instanceof TagNode) {
          TagNode tag = (TagNode) node;
          prvContent.append('<');
          prvContent.append(tag.getText());
          prvContent.append('>');
        } else if (node instanceof TextNode) {
          int space = max_count - prvContent.length();
          // Only emit text while a meaningful amount of budget is left.
          if (space > 10) {
            String content = ((TextNode) node).getText();
            if (content.length() < 10) {
              prvContent.append(content);
            } else {
              prvContent.append(StringUtils.abbreviate(content, space));
            }
          }
        }
      }
      return prvContent.toString();
    } catch (ParserException e) {
      e.printStackTrace();
    }
    // Parsing failed: fall back to the untouched input.
    return html;
  }