private Hashtable<String, Integer> extractData(String docUrl, Page page) throws Exception { Hashtable<String, Integer> businessTable = new Hashtable<String, Integer>(); String currentBusiness = ""; Lexer lexer = new Lexer(page); while (true) { Node node = lexer.nextNode(); if (node == null) { break; } if (node instanceof TagNode) { TagNode tagNode = (TagNode) node; if (tagNode.getTagName().equals("A")) { String href = tagNode.getAttribute("href"); if (href != null) { String absUrl = AbsUrlConstructor.construct(docUrl, href); Crawler.dispatchUrl(absUrl); Pattern pBusiness = Pattern.compile("^(http://www.yelp.com/biz/)(\\S)+"); if (pBusiness.matcher(absUrl).matches()) { currentBusiness = extractBusinessName(href); if (!businessTable.containsKey(currentBusiness)) { businessTable.put(currentBusiness, -1); } // System.out.println("currentBusiness = "+currentBusiness); // rating = "4"; // UpdateDatabase(linkID, business, rating, userID); // System.out.println(business + " added."); } } } else if (tagNode.getTagName().equals("IMG")) { String c1ass2 = tagNode.getAttribute("class"); if (c1ass2 != null) { String rating = ""; String[] rate = c1ass2.split("_"); int num = rate.length - 1; if (!rate[num].equals("loader")) { rating = rate[num].trim(); if (businessTable.get(currentBusiness) == -1) { businessTable.put(currentBusiness, Integer.parseInt(rating)); } } // System.out.println(linkID + " " + business + " " + rating + " " + userID; } } } } return businessTable; }
/**
 * Parses the given text to create the tag contents.
 *
 * <p>The text is run through a fresh {@code Lexer}; the page, start position, end position
 * and attributes of the first node it yields are copied into this tag.
 *
 * @param text A string of the form {@code <TAGNAME xx="yy">}.
 * @throws IllegalArgumentException if lexing fails (the {@code ParserException} is attached
 *     as the cause) or if the first node of the text is not a tag
 */
public void setText(String text) {
    Lexer lexer = new Lexer(text);
    try {
        // Check before casting so malformed input fails with a descriptive
        // IllegalArgumentException instead of a ClassCastException or NullPointerException.
        Object first = lexer.nextNode();
        if (!(first instanceof TagNode)) {
            throw new IllegalArgumentException("Text does not start with a tag: " + text);
        }
        TagNode output = (TagNode) first;
        mPage = output.getPage();
        nodeBegin = output.getStartPosition();
        nodeEnd = output.getEndPosition();
        mAttributes = output.getAttributesEx();
    } catch (ParserException pe) {
        // Keep the original exception as the cause so the stack trace is not lost.
        throw new IllegalArgumentException(pe.getMessage(), pe);
    }
}
/** * Given an input, makes it safe for HTML displaying. Removes any not allowed HTML tag or * attribute, as well unwanted JavaScript statements inside the tags. * * @param contents the input to analyze * @return the modified and safe string */ public String makeSafe(String contents) { if (contents == null || contents.length() == 0) { return contents; } StringBuffer sb = new StringBuffer(contents.length()); try { Lexer lexer = new Lexer(contents); Node node; while ((node = lexer.nextNode()) != null) { boolean isTextNode = node instanceof TextNode; if (isTextNode) { // Text nodes are raw data, so we just // strip off all possible HTML content String text = node.toHtml(); if (text.indexOf('>') > -1 || text.indexOf('<') > -1) { text = text.replaceAll("<", "<"); text = text.replaceAll(">", ">"); text = text.replaceAll("\"", """); node.setText(text); } } if (isTextNode || (node instanceof Tag && this.isTagWelcome(node))) { sb.append(node.toHtml()); } else { String text = node.toHtml(); text = text.replaceAll("<", "<"); text = text.replaceAll(">", ">"); sb.append(text); } } } catch (Exception e) { throw new ForumException("Error while parsing HTML: " + e, e); } return sb.toString(); }
/**
 * Compiles the template text into a {@code CompiledTemplate}.
 *
 * <p>The text is tokenized with a {@code Lexer} and every node is handed to an
 * {@code HTMLNodeVisitor}; the visitor's root node becomes the compiled template.
 *
 * @param template the template source text
 * @param parserConfiguration configuration passed on to the node visitor
 * @return a {@code CompiledTemplateImpl} wrapping the visitor's root node
 * @throws RuntimeException wrapping the {@code ParserException} if tokenizing fails
 */
@Override
public CompiledTemplate compile(
        final String template, final ParserConfiguration parserConfiguration) {
    Source templateSource = new StringSource(template);
    Lexer tokenizer = new Lexer(new Page(templateSource));

    HTMLNodeVisitor nodeVisitor =
            new HTMLNodeVisitor(
                    ehtAttributeprefix, expressionCompiler, inlineCompilers, parserConfiguration);

    nodeVisitor.beginParsing();
    try {
        Node next;
        while ((next = tokenizer.nextNode()) != null) {
            next.accept(nodeVisitor);
        }
    } catch (ParserException parseFailure) {
        throw new RuntimeException(parseFailure);
    }
    nodeVisitor.finishedParsing();

    return new CompiledTemplateImpl(nodeVisitor.getRootNode());
}
/**
 * Command line entry point: prints every node lexed from the given URL.
 *
 * <p>Without arguments a version banner and a usage line are printed. Otherwise a connection
 * to {@code args[0]} is opened through the page's connection manager and each node is written
 * to standard output. A {@code ParserException} raised while doing so is reported on standard
 * output (its message, then the message of its wrapped throwable if there is one).
 *
 * @param args [0] The URL to parse.
 * @exception MalformedURLException If the provided URL cannot be resolved.
 * @exception ParserException If the parse fails.
 */
public static void main(String[] args) throws MalformedURLException, ParserException {
    if (args.length <= 0) {
        System.out.println("HTML Lexer v" + getVersion() + "\n");
        System.out.println();
        System.out.println("usage: java -jar htmllexer.jar <url>");
        return;
    }

    try {
        ConnectionManager connections = Page.getConnectionManager();
        Lexer urlLexer = new Lexer(connections.openConnection(args[0]));
        for (Node current = urlLexer.nextNode(false);
                current != null;
                current = urlLexer.nextNode(false)) {
            System.out.println(current.toString());
        }
    } catch (ParserException pe) {
        System.out.println(pe.getMessage());
        Throwable wrapped = pe.getThrowable();
        if (wrapped != null) {
            System.out.println(wrapped.getMessage());
        }
    }
}
/**
 * Given an input, analyze each HTML tag and remove unsecured attributes from them.
 *
 * <p>Every tag node is passed through {@code checkAndValidateAttributes} before being
 * re-serialized; all other nodes are copied to the output unchanged.
 *
 * @param contents The content to verify
 * @return the content, secure; {@code null} or empty input is returned unchanged
 * @throws ForumException if the HTML cannot be parsed
 */
public String ensureAllAttributesAreSafe(String contents) {
    // Nothing to analyze; this also avoids a NullPointerException on contents.length().
    if (contents == null || contents.length() == 0) {
        return contents;
    }

    StringBuilder sb = new StringBuilder(contents.length());

    try {
        Lexer lexer = new Lexer(contents);
        Node node;

        while ((node = lexer.nextNode()) != null) {
            if (node instanceof Tag) {
                Tag tag = (Tag) node;

                this.checkAndValidateAttributes(tag, false);

                sb.append(tag.toHtml());
            } else {
                sb.append(node.toHtml());
            }
        }
    } catch (Exception e) {
        throw new ForumException("Problems while parsing HTML: " + e, e);
    }

    return sb.toString();
}
/**
 * Reads nodes from the underlying lexer until one has non-blank HTML and returns it as an
 * {@code Element}, which is also stored in {@code current}.
 *
 * <ul>
 *   <li>HTML starting with {@code </} becomes an {@code End} element whose text is the whole
 *       HTML.
 *   <li>HTML starting with {@code <} becomes a {@code Start} element. If it contains a space,
 *       the text is the tag name (the characters between {@code <} and the first space) and
 *       the remainder is scanned for {@code name="value"} pairs; a trailing {@code />} marks
 *       the element as not open. Without a space the text is the whole HTML.
 *   <li>Anything else becomes a {@code Text} element.
 * </ul>
 *
 * @return the next element, or {@code null} when the lexer has no more nodes
 * @throws HTMLReaderException if the lexer reports a {@code ParserException}
 */
public Element nextToken() throws HTMLReaderException {
    current = null;
    try {
        Node node;
        while ((node = lr.nextNode()) != null) {
            String html = node.toHtml(true).trim();
            if (html.length() == 0) {
                // Blank node: keep reading.
                continue;
            }

            current = new Element();
            if (html.startsWith("</")) {
                current.setType(Element.Type.End);
                current.setText(html);
            } else if (html.startsWith("<")) {
                current.setType(Element.Type.Start);
                int space = html.indexOf(' ');
                if (space == -1) {
                    current.setText(html);
                } else {
                    current.setText(html.substring(1, space));
                    String attrs = html.substring(space + 1);
                    if (attrs.endsWith("/>")) {
                        attrs = attrs.substring(0, attrs.length() - 2);
                        current.setOpen(false);
                    } else {
                        // Drop the closing '>'.
                        attrs = attrs.substring(0, attrs.length() - 1);
                    }
                    parseQuotedAttributes(attrs.trim(), current);
                }
            } else {
                current.setType(Element.Type.Text);
                current.setText(html);
            }
            return current;
        }
    } catch (ParserException ex) {
        throw new HTMLReaderException(ex.toString());
    }
    return null;
}

/**
 * Scans a run of {@code name="value"} pairs and adds each pair to the target element.
 *
 * <p>Only double-quoted values are recognized. Characters outside quotes, other than
 * {@code =}, accumulate into the name, which is trimmed before it is stored so the whitespace
 * separating two attributes does not become part of the following name.
 * NOTE(review): a value-less attribute is therefore merged into the name of the next quoted
 * attribute; confirm whether callers rely on that.
 */
private void parseQuotedAttributes(String attrs, Element target) {
    StringBuilder name = new StringBuilder();
    StringBuilder value = new StringBuilder();
    boolean inValue = false;

    for (int i = 0; i < attrs.length(); i++) {
        char ch = attrs.charAt(i);
        if (inValue) {
            if (ch == '"') {
                target.addAttribute(name.toString().trim(), value.toString());
                inValue = false;
                name.setLength(0);
                value.setLength(0);
            } else {
                value.append(ch);
            }
        } else if (ch == '"') {
            inValue = true;
        } else if (ch != '=') {
            name.append(ch);
        }
    }
}