/**
 * Lexes {@code page} and collects one rating per business link found in it.
 *
 * <p>Every {@code A} tag with an {@code href} is resolved against {@code docUrl} and handed to
 * {@code Crawler.dispatchUrl}. When the resolved URL matches the Yelp business pattern, the
 * business name derived from the href becomes the "current" business and is registered with the
 * placeholder rating {@code -1}. Every {@code IMG} tag with a {@code class} attribute is then
 * inspected: the text after the last underscore of the class (unless it is {@code "loader"}) is
 * parsed as an integer and stored as the rating of the current business, but only while that
 * business still holds the placeholder.
 *
 * @param docUrl URL of the document, used as the base for resolving hrefs
 * @param page page to lex
 * @return table mapping business name to rating; {@code -1} means no rating image was found
 * @throws Exception if lexing or any of the helper calls fails
 */
private Hashtable<String, Integer> extractData(String docUrl, Page page) throws Exception {
    Hashtable<String, Integer> businessTable = new Hashtable<String, Integer>();
    // Compiled once per call instead of once per anchor tag.
    Pattern pBusiness = Pattern.compile("^(http://www.yelp.com/biz/)(\\S)+");
    String currentBusiness = "";
    Lexer lexer = new Lexer(page);
    Node node;
    while ((node = lexer.nextNode()) != null) {
      if (!(node instanceof TagNode)) {
        continue;
      }

      TagNode tagNode = (TagNode) node;
      String tagName = tagNode.getTagName();
      if ("A".equals(tagName)) {
        String href = tagNode.getAttribute("href");
        if (href == null) {
          continue;
        }
        String absUrl = AbsUrlConstructor.construct(docUrl, href);

        Crawler.dispatchUrl(absUrl);

        if (pBusiness.matcher(absUrl).matches()) {
          currentBusiness = extractBusinessName(href);
          if (!businessTable.containsKey(currentBusiness)) {
            businessTable.put(currentBusiness, -1);
          }
        }
      } else if ("IMG".equals(tagName)) {
        String cssClass = tagNode.getAttribute("class");
        if (cssClass == null) {
          continue;
        }

        String[] parts = cssClass.split("_");
        if (parts.length == 0) {
          // A class made only of underscores splits into an empty array.
          continue;
        }

        String suffix = parts[parts.length - 1];
        if (suffix.equals("loader")) {
          continue;
        }

        // get() returns null when no business link has been seen yet; unboxing that
        // null in a comparison with -1 would throw a NullPointerException.
        Integer known = businessTable.get(currentBusiness);
        if (known == null || known.intValue() != -1) {
          continue;
        }

        try {
          businessTable.put(currentBusiness, Integer.parseInt(suffix.trim()));
        } catch (NumberFormatException ignored) {
          // The image's class does not end in a number, so it is not a rating image;
          // keep the placeholder and let a later image supply the rating.
        }
      }
    }

    return businessTable;
  }
Esempio n. 2
0
  /**
   * Parses the given text to create the tag contents.
   *
   * <p>The first node lexed from {@code text} supplies this tag's page, start and end positions
   * and attributes.
   *
   * @param text A string of the form &lt;TAGNAME xx="yy"&gt;.
   * @throws IllegalArgumentException if the text cannot be lexed, or if the first node lexed from
   *     it is not a tag
   */
  public void setText(String text) {
    Lexer lexer;
    TagNode output;

    lexer = new Lexer(text);
    try {
      Object first = lexer.nextNode();
      // Empty input yields no node and plain text yields a non-tag node; report both as a bad
      // argument instead of failing with a NullPointerException or ClassCastException.
      if (!(first instanceof TagNode)) {
        throw new IllegalArgumentException("text does not start with a tag: " + text);
      }
      output = (TagNode) first;
      mPage = output.getPage();
      nodeBegin = output.getStartPosition();
      nodeEnd = output.getEndPosition();
      mAttributes = output.getAttributesEx();
    } catch (ParserException pe) {
      // Keep the parser exception as the cause so its stack trace is not lost.
      throw new IllegalArgumentException(pe.getMessage(), pe);
    }
  }
Esempio n. 3
0
  /**
   * Rewrites the input so it can be displayed as HTML.
   *
   * <p>The input is lexed node by node. Text nodes containing {@code <} or {@code >} have those
   * characters and double quotes replaced by entities. Tags accepted by {@code isTagWelcome} are
   * emitted unchanged. Every other node is emitted with its angle brackets replaced by entities.
   *
   * @param contents the input to analyze
   * @return the rewritten string, or {@code contents} itself when it is null or empty
   */
  public String makeSafe(String contents) {
    if (contents == null || contents.length() == 0) {
      return contents;
    }

    StringBuilder safe = new StringBuilder(contents.length());

    try {
      Lexer lexer = new Lexer(contents);

      for (Node current = lexer.nextNode(); current != null; current = lexer.nextNode()) {
        if (current instanceof TextNode) {
          // Text nodes are raw data: neutralize any markup characters found in them.
          String raw = current.toHtml();
          boolean hasAngleBracket = raw.indexOf('>') > -1 || raw.indexOf('<') > -1;

          if (hasAngleBracket) {
            String escaped =
                raw.replace("<", "&lt;").replace(">", "&gt;").replace("\"", "&quot;");
            current.setText(escaped);
          }

          safe.append(current.toHtml());
          continue;
        }

        if (current instanceof Tag && this.isTagWelcome(current)) {
          safe.append(current.toHtml());
          continue;
        }

        // Anything else is shown literally instead of being interpreted as markup.
        String rejected = current.toHtml();
        safe.append(rejected.replace("<", "&lt;").replace(">", "&gt;"));
      }
    } catch (Exception e) {
      throw new ForumException("Error while parsing HTML: " + e, e);
    }

    return safe.toString();
  }
 /**
  * Compiles a template string by lexing it and feeding every node to an {@code HTMLNodeVisitor}.
  *
  * @param template the template text to lex
  * @param parserConfiguration configuration passed on to the visitor
  * @return a compiled template built from the visitor's root node
  * @throws RuntimeException wrapping the {@code ParserException} if lexing fails
  */
 @Override
 public CompiledTemplate compile(
     final String template, final ParserConfiguration parserConfiguration) {
   Source source = new StringSource(template);
   Lexer lexer = new Lexer(new Page(source));
   HTMLNodeVisitor visitor =
       new HTMLNodeVisitor(
           ehtAttributeprefix, expressionCompiler, inlineCompilers, parserConfiguration);

   visitor.beginParsing();
   try {
     Node node;
     while ((node = lexer.nextNode()) != null) {
       node.accept(visitor);
     }
   } catch (ParserException e) {
     throw new RuntimeException(e);
   }
   visitor.finishedParsing();

   return new CompiledTemplateImpl(visitor.getRootNode());
 }
Esempio n. 5
0
  /**
   * Command line entry point: prints every node lexed from the given URL.
   *
   * <p>Without arguments a version banner and usage line are printed. A {@code ParserException}
   * raised while connecting or lexing is caught here and its message (and that of its underlying
   * throwable, if any) is printed to standard output.
   *
   * @param args [0] The URL to parse.
   * @exception MalformedURLException If the provided URL cannot be resolved.
   * @exception ParserException If the parse fails.
   */
  public static void main(String[] args) throws MalformedURLException, ParserException {
    if (args.length <= 0) {
      System.out.println("HTML Lexer v" + getVersion() + "\n");
      System.out.println();
      System.out.println("usage: java -jar htmllexer.jar <url>");
      return;
    }

    try {
      ConnectionManager manager = Page.getConnectionManager();
      Lexer lexer = new Lexer(manager.openConnection(args[0]));
      for (Node node = lexer.nextNode(false); node != null; node = lexer.nextNode(false)) {
        System.out.println(node.toString());
      }
    } catch (ParserException pe) {
      System.out.println(pe.getMessage());
      if (pe.getThrowable() != null) {
        System.out.println(pe.getThrowable().getMessage());
      }
    }
  }
Esempio n. 6
0
  /**
   * Given an input, analyze each HTML tag and remove unsecured attributes from them.
   *
   * <p>The input is lexed node by node; each tag is passed to {@code checkAndValidateAttributes}
   * before being written out, and every other node is written out unchanged.
   *
   * @param contents The content to verify
   * @return the content, secure; {@code contents} itself when it is null or empty
   */
  public String ensureAllAttributesAreSafe(String contents) {
    // Same guard as makeSafe: nothing to lex, and avoids a NullPointerException on null input.
    if (contents == null || contents.length() == 0) {
      return contents;
    }

    StringBuilder sb = new StringBuilder(contents.length());

    try {
      Lexer lexer = new Lexer(contents);
      Node node;

      while ((node = lexer.nextNode()) != null) {
        if (node instanceof Tag) {
          Tag tag = (Tag) node;

          this.checkAndValidateAttributes(tag, false);

          sb.append(tag.toHtml());
        } else {
          sb.append(node.toHtml());
        }
      }
    } catch (Exception e) {
      throw new ForumException("Problems while parsing HTML: " + e, e);
    }

    return sb.toString();
  }
Esempio n. 7
0
  /**
   * Reads nodes from the lexer until one with non-blank HTML is found and converts it into an
   * {@code Element}, which is also stored in {@code current}.
   *
   * <ul>
   *   <li>HTML starting with {@code </} becomes an End element whose text is the whole HTML.
   *   <li>HTML starting with {@code <} becomes a Start element. If it contains a space, the text
   *       is the tag name and the remainder is scanned for {@code name="value"} pairs (only
   *       double-quoted values are recognised); a trailing {@code />} marks the element as not
   *       open. Otherwise the text is the whole HTML.
   *   <li>Anything else becomes a Text element whose text is the trimmed HTML.
   * </ul>
   *
   * @return the next element, or {@code null} when the lexer has no more nodes
   * @throws HTMLReaderException if the lexer fails
   */
  public Element nextToken() throws HTMLReaderException {
    Node node = null;

    current = null;

    try {
      while ((node = lr.nextNode()) != null) {
        String html = node.toHtml(true).trim();

        if (html.length() == 0) {
          // Whitespace-only node: nothing to report, keep reading.
          continue;
        }

        current = new Element();

        if (html.startsWith("</")) {
          current.setType(Element.Type.End);
          current.setText(html);
        } else if (html.startsWith("<")) {
          current.setType(Element.Type.Start);

          int i = html.indexOf(' ');

          if (i != -1) {
            StringBuilder id = new StringBuilder();
            StringBuilder value = new StringBuilder();
            boolean valueSeek = false;

            current.setText(html.substring(1, i));
            html = html.substring(i + 1);

            if (html.endsWith("/>")) {
              html = html.substring(0, html.length() - 2);
              current.setOpen(false);
            } else {
              // Drop the closing '>'.
              html = html.substring(0, html.length() - 1);
            }

            html = html.trim();

            for (i = 0; i < html.length(); i++) {
              char ch = html.charAt(i);

              if (valueSeek) {
                if (ch == '"') {
                  // The whitespace separating this attribute from the previous one was
                  // collected into the name; trim it so the second attribute is "class",
                  // not " class".
                  current.addAttribute(id.toString().trim(), value.toString());
                  valueSeek = false;
                  id.setLength(0);
                  value.setLength(0);
                } else {
                  value.append(ch);
                }
              } else if (ch == '=') {
                // Separator between name and value: contributes to neither.
                continue;
              } else if (ch == '"') {
                valueSeek = true;
              } else {
                id.append(ch);
              }
            }
          } else {
            current.setText(html);
          }
        } else {
          current.setType(Element.Type.Text);
          current.setText(html);
        }

        return current;
      }
    } catch (ParserException ex) {
      // NOTE(review): only the exception's string form is kept; if HTMLReaderException has a
      // cause-accepting constructor, pass ex along — confirm against its declaration.
      throw new HTMLReaderException(ex.toString());
    }
    return null;
  }