// If there is a <title> element on the start page, use that as our AU // name. String recomputeRegName() { if (!isStarted()) { // This can get invoked (seveeral times, mostly from logging) before // enough mechanism has started to make it possible to resolve the CuUrl // below. return null; } try { CachedUrl cu = makeCachedUrl(m_registryUrl); if (cu == null) return null; URL cuUrl = CuUrl.fromCu(cu); Parser parser = new Parser(cuUrl.toString()); NodeList nodelst = parser.extractAllNodesThatMatch(new NodeClassFilter(TitleTag.class)); Node nodes[] = nodelst.toNodeArray(); recomputeRegName = false; if (nodes.length < 1) return null; // Get the first title found TitleTag tag = (TitleTag) nodes[0]; if (tag == null) return null; return tag.getTitle(); } catch (MalformedURLException e) { log.warning("recomputeRegName", e); return null; } catch (ParserException e) { if (e.getThrowable() instanceof FileNotFoundException) { log.warning("recomputeRegName: " + e.getThrowable().toString()); } else { log.warning("recomputeRegName", e); } return null; } }
// 获取一个网站上的链接,filter 用来过滤链接 public static Set<String> extracLinks(String url, NodeFilter filter) { Set<String> links = new HashSet<String>(); try { Parser parser = new Parser(url); parser.setEncoding("UTF-8"); @SuppressWarnings("serial") NodeFilter frameFilter = new NodeFilter() { public boolean accept(Node node) { if (node.getText().startsWith("frame src=")) { return true; } else { return false; } } }; OrFilter linkFilter = new OrFilter(new NodeClassFilter(LinkTag.class), frameFilter); NodeList list = parser.extractAllNodesThatMatch(linkFilter); System.out.println("length=" + list.size()); for (int i = 0; i < list.size(); i++) { Node tag = list.elementAt(i); if (tag instanceof LinkTag) { // <a> 标签 LinkTag link = (LinkTag) tag; String linkUrl = link.getLink(); // URL /* * if (filter.accept(linkUrl)) { links.add(linkUrl); } */ System.out.println("linkUrl=" + linkUrl); if (filter.accept(tag)) { links.add(linkUrl); } } else { // <frame> 标签 // 提取 frame 里 src 属性的链接,如 <frame src="test.html"/> String frame = tag.getText(); int start = frame.indexOf("src="); frame = frame.substring(start); int end = frame.indexOf(" "); if (end == -1) { end = frame.indexOf(">"); } String frameUrl = frame.substring(5, end - 1); // if (filter.accept(frameUrl)) { // links.add(frameUrl); // } System.out.println("frameUrl=" + frameUrl); if (filter.accept(tag)) { links.add(frameUrl); } } } /* * NodeFilter filter = new TagNameFilter("DIV"); NodeList nodes = * parser.extractAllNodesThatMatch(filter); if(nodes!=null) { for * (int i = 0; i < nodes.size(); i++) { Node textnode = (Node) * nodes.elementAt(i); * System.out.println("getText:"+textnode.getText()); * System.out.println * ("================================================="); } } */ /* * for(NodeIterator i = parser.elements (); i.hasMoreNodes(); ) { * Node node = i.nextNode(); * System.out.println("getText:"+node.getText()); * System.out.println("getPlainText:"+node.toPlainTextString()); * System.out.println("toHtml:"+node.toHtml()); * System.out.println("toHtml(true):"+node.toHtml(true)); * System.out.println("toHtml(false):"+node.toHtml(false)); * System.out.println("toString:"+node.toString()); * System.out.println * ("================================================="); } */ /* * TextExtractingVisitor visitor = new TextExtractingVisitor(); * parser.visitAllNodesWith(visitor); String textInPage = * visitor.getExtractedText(); System.out.println(textInPage); */ } catch (ParserException e) { e.printStackTrace(); } return links; }
private static NodeList getIFrames(NodeList node_list) { return node_list.extractAllNodesThatMatch(new TagNameFilter("iframe"), true); }
private static NodeList getRemarks(NodeList node_list) { return node_list.extractAllNodesThatMatch(new NodeClassFilter(RemarkNode.class), true); }
private static NodeList getPhotosetPosts(NodeList node_list) { return node_list.extractAllNodesThatMatch( new HasAttributeFilter("class", "my_photoset_post"), true); }
private static boolean handleURL(String address) { Main.status(String.format("Processing page \"%s\".", address)); try { NodeList posts = getPosts(address); if (posts.toNodeArray().length == 0) { return false; } for (Node post_node : posts.toNodeArray()) { if (post_node instanceof TagNode) { TagNode post = (TagNode) post_node; Post new_post = new Post(Long.parseLong(post.getAttribute("id").substring(5))); if (!Main.post_post_hash.containsKey(new_post)) { NodeList photo_posts = getPhotoPosts(post.getChildren()); NodeList remarks = getRemarks(photo_posts); for (Node node : remarks.toNodeArray()) { Matcher matcher = lores.matcher(node.getText()); String media_url = ""; if (matcher.find()) { media_url = matcher.group(); media_url = media_url.substring(17, media_url.length() - 1); } String thumb = media_url.replace( media_url.substring(media_url.lastIndexOf("_"), media_url.lastIndexOf(".")), "_75sq"); URL thumb_url = new URL(thumb); new_post.pictures.add(new Picture(new URL(media_url), thumb_url)); } NodeList photoset_posts = getPhotosetPosts(post.getChildren()); NodeList iframes = getIFrames(photoset_posts); for (Node node : iframes.toNodeArray()) { if (node instanceof TagNode) { String iframe_url = ((TagNode) node).getAttribute("src"); Parser parser2 = new Parser(iframe_url); NodeList a_list = parser2.extractAllNodesThatMatch(new TagNameFilter("a")); Node[] a_array = a_list.toNodeArray(); Node[] img_array = a_list.extractAllNodesThatMatch(new TagNameFilter("img"), true).toNodeArray(); String media_url; for (int i = 0; i < a_array.length; i++) { media_url = ((TagNode) img_array[i]).getAttribute("src"); String thumb = media_url.replace( media_url.substring( media_url.lastIndexOf("_"), media_url.lastIndexOf(".")), "_75sq"); URL thumb_url = new URL(thumb); new_post.pictures.add(new Picture(new URL(media_url), thumb_url)); } } } Main.handlePost(new_post); } else { new_post = post_post_hash.get(new_post); handleNonDownloadPost(new_post); } } } } catch (Exception ex) { ex.printStackTrace(); Main.status("Error handling post."); } return true; }