Пример #1
0
  /**
   * Parses an HTML stream into a {@link WebPage}.
   *
   * <p>The method reads, in order: the document language from the {@code html} root, the title,
   * the {@code meta} elements (description, author, category, copyright, content type and
   * content language), the {@code link} elements (icon, speech style sheets and related
   * documents) and the embedded {@code style} elements. It then walks the {@code body} with
   * {@code depthFirstSearch} to collect the page items.
   *
   * @param inStream stream holding the HTML document
   * @param url location of the document, used to resolve relative links; may be {@code null}
   * @return the populated web page
   * @throws ParsingException if the {@code html} root or the {@code body} element is missing
   * @see org.tramper.parser.Parser#parse(java.io.InputStream, URL)
   */
  public SimpleDocument parse(InputStream inStream, URL url) throws ParsingException {
    Map<String, Map<String, Object>> allStyleMap =
        new HashMap<String, Map<String, Object>>(defaultStyleMap);
    // The constructor above only copies the outer map. Copy every per-selector map as well,
    // otherwise merging the page styles below would write into the shared default styles.
    for (Map.Entry<String, Map<String, Object>> defaultEntry : allStyleMap.entrySet()) {
      if (defaultEntry.getValue() != null) {
        defaultEntry.setValue(new HashMap<String, Object>(defaultEntry.getValue()));
      }
    }
    Document docRoot = this.makeDocument(inStream);

    WebPage doc = new WebPage();
    doc.setMimeType("text/html");
    doc.setCharset("UTF-8");

    Element root = (Element) docRoot.getElementsByTagName("html").item(0);
    if (root == null) {
      logger.error("no html root");
      throw new ParsingException("no html root");
    }
    // DOM returns an empty string (not null) for a missing attribute, so emptiness has to be
    // tested for the xml:lang and default-locale fallbacks to be reachable.
    String lang = root.getAttribute("lang");
    if (lang == null || lang.length() == 0) {
      lang = root.getAttribute("xml:lang");
    }
    if (lang != null && lang.length() > 0) {
      doc.parseLanguage(lang);
    } else {
      doc.setLanguage(Locale.getDefault());
    }

    String hostUrl = null, baseUrl = null;
    if (url != null) {
      // first we determine the base url and the host part
      hostUrl = url.getProtocol() + "://" + url.getHost();
      int port = url.getPort();
      if (port != -1) {
        hostUrl += ":" + port;
      }
      String path = url.getPath();
      int lastSlashIndex = path.lastIndexOf("/");
      if (lastSlashIndex != -1) {
        // keep the directory part only, trailing slash included
        path = path.substring(0, lastSlashIndex + 1);
      } else {
        path = "/";
      }
      baseUrl = hostUrl + path;
    }

    NodeList titleList = docRoot.getElementsByTagName("title");
    if (titleList.getLength() > 0) {
      Node titleText = titleList.item(0).getFirstChild();
      if (titleText != null) {
        doc.setTitle(titleText.getNodeValue());
      }
    }

    NodeList metaList = docRoot.getElementsByTagName("meta");
    for (int i = 0; i < metaList.getLength(); i++) {
      Element meta = (Element) metaList.item(i);
      String metaName = meta.getAttribute("name");
      String metaHttpEquiv = meta.getAttribute("http-equiv");
      String metaContent = meta.getAttribute("content");
      if (metaName != null && !metaName.equals("")) {
        if (metaName.equalsIgnoreCase("description")) {
          doc.setDescription(metaContent);
        } else if (metaName.equalsIgnoreCase("author")) {
          doc.setAuthor(metaContent);
        } else if (metaName.equalsIgnoreCase("category")) {
          doc.setCategory(metaContent);
        } else if (metaName.equalsIgnoreCase("copyright")) {
          doc.setCopyright(metaContent);
        }
      } else if (metaHttpEquiv != null && !metaHttpEquiv.equals("")) {
        if (metaHttpEquiv.equalsIgnoreCase("content-type")) {
          if (metaContent != null) {
            // manage the charset if present, e.g. "text/html; charset=UTF-8"
            int semiColumnIndex = metaContent.indexOf(";");
            if (semiColumnIndex != -1) {
              int equalIndex = metaContent.indexOf("=");
              if (equalIndex != -1) {
                // trim: whitespace around the value is not part of the charset name
                doc.setCharset(metaContent.substring(equalIndex + 1).trim());
              }
              metaContent = metaContent.substring(0, semiColumnIndex);
            }
            doc.setMimeType(metaContent.trim());
          }
        } else if (metaHttpEquiv.equalsIgnoreCase("content-language")) {
          if (metaContent != null) {
            doc.parseLanguage(metaContent);
          }
        }
      }
    }

    SpeechCSSParser cssParser = new SpeechCSSParser();

    NodeList metaLink = docRoot.getElementsByTagName("link");
    for (int i = 0; i < metaLink.getLength(); i++) {
      Element link = (Element) metaLink.item(i);
      String linkHref = link.getAttribute("href");
      try {
        URL anUrl = completeUrl(linkHref, baseUrl, hostUrl);
        if (anUrl == null) {
          continue;
        }
        String linkTitle = link.getAttribute("title");
        String linkRel = link.getAttribute("rel");
        String linkType = link.getAttribute("type");
        if (linkRel.equalsIgnoreCase("icon") || linkRel.equalsIgnoreCase("shortcut icon")) {
          try {
            Image img = ImageIO.read(anUrl);
            doc.setIcon(img);
          } catch (Exception e) {
            // best effort: a page without icon is still usable
            logger.debug("unreadable icon: " + anUrl);
          }
        } else if (linkRel.equalsIgnoreCase("stylesheet")) {
          if ("text/css".equals(linkType)) {
            // Locale.ROOT keeps the comparison independent from the platform locale
            String media = link.getAttribute("media").toLowerCase(Locale.ROOT);
            if (media.equals("") || media.equals("speech") || media.equals("aural")) {
              // load a CSS style sheet and merge it over the styles collected so far
              mergeStyleSheet(allStyleMap, cssParser.parse(anUrl));
            }
          }
        } else {
          SimpleDocument aDocument = new SimpleDocument();
          aDocument.setUrl(anUrl);
          aDocument.setTitle(linkTitle);
          aDocument.setMimeType(linkType);

          Link aLink = new Link();
          aLink.setLinkedDocument(aDocument);
          aLink.setLinkingDocument(doc);
          if (linkRel.equalsIgnoreCase("prev")) {
            aLink.setRelation("previous");
          } else {
            aLink.setRelation(linkRel);
          }
          doc.addLink(aLink);
        }
      } catch (MalformedURLException e) {
        logger.debug(e.getMessage());
      }
    }

    // url may be null (see the check above), so it must not be dereferenced blindly.
    // NOTE(review): confirm that SpeechCSSParser.parse accepts a null base URI.
    String documentUri = (url != null) ? url.toString() : null;
    NodeList styles = docRoot.getElementsByTagName("style");
    for (int i = 0; i < styles.getLength(); i++) {
      Element style = (Element) styles.item(i);
      mergeStyleSheet(allStyleMap, cssParser.parse(style.getTextContent(), documentUri));
    }

    Node body = docRoot.getElementsByTagName("body").item(0);
    if (body == null) {
      throw new ParsingException("no body element in HTML document");
    }

    int linkNumber = 1;
    try {
      label = ResourceBundle.getBundle("label", doc.getLanguage());
    } catch (Exception e) {
      // either the document language is null, or there is no properties file for it
      label = ResourceBundle.getBundle("label");
    }
    int idIndex = 1;
    this.depthFirstSearch(
        doc, null, null, body, url, baseUrl, hostUrl, allStyleMap, idIndex, linkNumber);
    return doc;
  }

  /**
   * Merges the rules of one parsed style sheet into the accumulated style map. Properties of a
   * selector that is already known are overlaid on the existing ones, so properties the sheet
   * does not redefine are kept; unknown selectors are added as they are.
   *
   * @param allStyleMap accumulated styles, modified in place
   * @param styleMap styles of the sheet to merge; ignored when {@code null}
   */
  private void mergeStyleSheet(
      Map<String, Map<String, Object>> allStyleMap, Map<String, Map<String, Object>> styleMap) {
    if (styleMap == null) {
      return;
    }
    for (Map.Entry<String, Map<String, Object>> sheetEntry : styleMap.entrySet()) {
      Map<String, Object> elementStyles = allStyleMap.get(sheetEntry.getKey());
      if (elementStyles != null) {
        elementStyles.putAll(sheetEntry.getValue());
      } else {
        allStyleMap.put(sheetEntry.getKey(), sheetEntry.getValue());
      }
    }
  }
Пример #2
0
  /**
   * Depth first search of a tree node for collecting web page items.
   *
   * <p>Block-level elements (p, div, td, th, li, body, form) open a new item; form controls,
   * images, anchors, image areas and frames contribute a spoken label to the current item;
   * embed, bgsound and object elements add an "enclosure" link; text nodes are appended to the
   * current item and to the title of the enclosing anchor's target. The content of script and
   * style elements is not visited.
   *
   * @param doc HTML document
   * @param item current web page item, or {@code null} if none has been opened yet
   * @param aLink link of the enclosing anchor, or {@code null} outside of an anchor
   * @param node current HTML document node
   * @param url URL of the document to parse, may be {@code null}
   * @param baseUrl base used to resolve relative URLs
   * @param hostUrl host part used to resolve absolute paths
   * @param allStyleMap speech styles by selector
   * @param idIndex first number used for generated element ids
   * @param linkNumber first number used for link numbering
   */
  protected void depthFirstSearch(
      WebPage doc,
      WebPageItem item,
      Link aLink,
      Node node,
      URL url,
      String baseUrl,
      String hostUrl,
      Map<String, Map<String, Object>> allStyleMap,
      int idIndex,
      int linkNumber) {
    // The counters live in one-element arrays so that an increment made while visiting a
    // subtree is seen by the following siblings. With plain int parameters the increments were
    // lost on return and several elements received the same generated id or link number.
    int[] idCounter = {idIndex};
    int[] linkCounter = {linkNumber};
    collectNode(
        doc, item, aLink, node, url, baseUrl, hostUrl, allStyleMap, idCounter, linkCounter);
  }

  /** Visits one node, then its children, sharing the id and link counters along the way. */
  private void collectNode(
      WebPage doc,
      WebPageItem item,
      Link aLink,
      Node node,
      URL url,
      String baseUrl,
      String hostUrl,
      Map<String, Map<String, Object>> allStyleMap,
      int[] idCounter,
      int[] linkCounter) {
    boolean searchDepth = true;
    short nodeType = node.getNodeType();

    if (nodeType == Node.ELEMENT_NODE) {
      Element element = (Element) node;
      // Locale.ROOT: tag names must not be lower-cased with locale specific rules
      String nodeName = node.getNodeName().toLowerCase(Locale.ROOT);
      if (nodeName.equals("p")
          || nodeName.equals("div")
          || nodeName.equals("td")
          || nodeName.equals("th")
          || nodeName.equals("li")
          || nodeName.equals("body")
          || nodeName.equals("form")) {
        // elements defining a new item
        item = createPageItem(element, nodeName, url, allStyleMap, idCounter);
        doc.addItem(item);
      } else if (nodeName.equals("script") || nodeName.equals("style")) {
        // don't want to parse content of script and style elements
        searchDepth = false;
      } else if (item != null) {
        // without a current item there is nothing to attach content or links to
        aLink =
            collectInlineElement(
                doc, item, aLink, element, nodeName, baseUrl, hostUrl, idCounter, linkCounter);
      }
    } else if (nodeType == Node.TEXT_NODE) {
      String nodeValue = node.getNodeValue();
      if (nodeValue != null && !nodeValue.equals("")) {
        if (item != null) {
          item.appendContent(nodeValue);
        }
        if (aLink != null) {
          // the text of an anchor completes the title of the document it points to
          SimpleDocument aDocument = aLink.getLinkedDocument();
          if (aDocument != null) {
            String title = aDocument.getTitle();
            // avoid building a title starting with the literal "null"
            aDocument.setTitle(title == null ? nodeValue : title + nodeValue);
          }
        }
      }
    }

    if (searchDepth) {
      NodeList childNodes = node.getChildNodes();
      for (int i = 0; i < childNodes.getLength(); i++) {
        collectNode(
            doc,
            item,
            aLink,
            childNodes.item(i),
            url,
            baseUrl,
            hostUrl,
            allStyleMap,
            idCounter,
            linkCounter);
      }
    }
  }

  /**
   * Creates the item of a block-level element and computes its speech properties by overlaying,
   * in this order: element styles, class styles, id styles and the inline style attribute.
   */
  @SuppressWarnings("unchecked")
  private WebPageItem createPageItem(
      Element element,
      String nodeName,
      URL url,
      Map<String, Map<String, Object>> allStyleMap,
      int[] idCounter) {
    WebPageItem item = new WebPageItem();
    Map<String, Object> itemStyles = new HashMap<String, Object>();
    overlayStyles(itemStyles, allStyleMap.get(nodeName));

    // the class attribute is a whitespace separated list: look every class up on its own
    String itemClass = element.getAttribute("class").trim();
    if (itemClass.length() > 0) {
      for (String className : itemClass.split("\\s+")) {
        overlayStyles(itemStyles, allStyleMap.get("*." + className));
        overlayStyles(itemStyles, allStyleMap.get(nodeName + "." + className));
      }
    }

    String itemId = element.getAttribute("id");
    if (itemId.equals("")) {
      // no id in the document: generate one
      itemId = "tramper" + (idCounter[0]++);
    } else {
      overlayStyles(itemStyles, allStyleMap.get("*#" + itemId));
      overlayStyles(itemStyles, allStyleMap.get(nodeName + "#" + itemId));
    }
    item.setId(itemId);

    String itemStyle = element.getAttribute("style");
    if (!itemStyle.equals("")) {
      SpeechCSSParser cssParser = new SpeechCSSParser();
      // url may be null. NOTE(review): confirm the CSS parser accepts a null base URI.
      String documentUri = (url != null) ? url.toString() : null;
      // "{" are necessary for CSS parser to work properly:
      overlayStyles(
          itemStyles, cssParser.parseStyleDeclaration("{" + itemStyle + "}", documentUri));
    }

    item.setVolume((Float) itemStyles.get(SpeechCSSParser.VOICE_VOLUME));
    item.setBalance((Float) itemStyles.get(SpeechCSSParser.VOICE_BALANCE));
    item.setSpeak((String) itemStyles.get(SpeechCSSParser.SPEAK));
    item.setPauseBefore((Float) itemStyles.get(SpeechCSSParser.PAUSE_BEFORE));
    item.setPauseAfter((Float) itemStyles.get(SpeechCSSParser.PAUSE_AFTER));
    item.setRestBefore((Float) itemStyles.get(SpeechCSSParser.REST_BEFORE));
    item.setRestAfter((Float) itemStyles.get(SpeechCSSParser.REST_AFTER));
    item.setCueBefore((URL) itemStyles.get(SpeechCSSParser.CUE_BEFORE));
    item.setCueAfter((URL) itemStyles.get(SpeechCSSParser.CUE_AFTER));
    item.setRate((Float) itemStyles.get(SpeechCSSParser.VOICE_RATE));
    item.setPitch((Float) itemStyles.get(SpeechCSSParser.VOICE_PITCH));
    item.setPitchRange((Float) itemStyles.get(SpeechCSSParser.VOICE_PITCH_RANGE));
    item.setVoiceFamily((List<VoiceDesc>) itemStyles.get(SpeechCSSParser.VOICE_FAMILY));
    return item;
  }

  /** Copies the properties of {@code source} over {@code target}; a null source is ignored. */
  private void overlayStyles(Map<String, Object> target, Map<String, Object> source) {
    if (source != null) {
      target.putAll(source);
    }
  }

  /**
   * Handles an element that does not open a new item: appends its spoken label and/or adds its
   * link to the current item.
   *
   * @return the link to hand down to the children: a new one for an anchor, {@code aLink}
   *     unchanged otherwise
   */
  private Link collectInlineElement(
      WebPage doc,
      WebPageItem item,
      Link aLink,
      Element element,
      String nodeName,
      String baseUrl,
      String hostUrl,
      int[] idCounter,
      int[] linkCounter) {
    // elements to be replaced by an alternative text
    if (nodeName.equals("input")) {
      String inputType = element.getAttribute("type");
      if (inputType != null && !inputType.equals("") && !inputType.equalsIgnoreCase("hidden")) {
        inputType = inputType.toLowerCase(Locale.ROOT);
        try {
          String inputText = label.getString("javaspeaker." + inputType);
          // read the input value if neither an image nor a checkbox, radio and file input
          if (!inputType.equals("image")
              && !inputType.equals("checkbox")
              && !inputType.equals("radio")
              && !inputType.equals("file")) {
            String inputValue = element.getAttribute("value");
            if (inputValue != null && !inputValue.equals("")) {
              inputText = inputText.concat(" " + inputValue);
            }
          }
          item.appendContent(inputText);
        } catch (MissingResourceException ignored) {
          // bad type attribute on input: there is no label for it, nothing is spoken
        }
      }
    } else if (nodeName.equals("select")) {
      item.appendContent(label.getString("javaspeaker.select"));
    } else if (nodeName.equals("textarea")) {
      item.appendContent(label.getString("javaspeaker.textarea"));
    } else if (nodeName.equals("img")) {
      appendLabelled(item, "javaspeaker.image", element.getAttribute("alt"));
    } else if (nodeName.equals("a")) {
      String link = label.getString("javaspeaker.link").concat(" ");
      String linkTitle = element.getAttribute("title");
      if (linkTitle != null && !linkTitle.equals("")) {
        link = link.concat(linkTitle);
      }
      item.appendContent(link);

      String linkId = element.getAttribute("id");
      if (linkId.equals("")) {
        linkId = "tramper" + (idCounter[0]++);
      }

      String linkHref = element.getAttribute("href");
      SimpleDocument aDocument = new SimpleDocument();
      try {
        aDocument.setUrl(completeUrl(linkHref, baseUrl, hostUrl));
      } catch (MalformedURLException e) {
        // log the offending attribute: no URL could be built from it.
        // The link is still registered below, without target URL.
        logger.debug("Bad A href: " + linkHref);
      }

      aLink = new Link();
      String linkRelation = element.getAttribute("rel");
      if (linkRelation != null && !linkRelation.equals("")) {
        if (linkRelation.equalsIgnoreCase("prev")) {
          aLink.setRelation("previous");
        } else {
          aLink.setRelation(linkRelation);
        }
      }

      aDocument.setTitle(linkTitle);
      aLink.setId(linkId);
      aLink.setLinkedDocument(aDocument);
      aLink.setLinkingDocument(doc);
      aLink.setNumber(linkCounter[0]++);
      item.addLink(aLink);
    } else if (nodeName.equals("area")) {
      appendLabelled(item, "javaspeaker.imagearea", element.getAttribute("alt"));
    } else if (nodeName.equals("frame")) {
      appendLabelled(item, "javaspeaker.frame", element.getAttribute("title"));
    }
    // embedded media to play
    else if (nodeName.equals("embed") || nodeName.equals("bgsound")) {
      addEnclosureLink(doc, item, element, "src", false, baseUrl, hostUrl, linkCounter);
    } else if (nodeName.equals("object")) {
      addEnclosureLink(doc, item, element, "data", true, baseUrl, hostUrl, linkCounter);
    }
    // parameters for objects elements
    else if (nodeName.equals("param")) {
      String paramName = element.getAttribute("name");
      String paramValue = element.getAttribute("value");
      List<Link> links = item.getLinks();
      if (links.size() > 0) {
        // the parameter is applied to the most recently added link of the item
        Link lastLink = links.get(links.size() - 1);
        SimpleDocument lastMedia = lastLink.getLinkedDocument();
        if (paramName.equalsIgnoreCase("src")) {
          // only used when the media has no URL yet
          if (lastMedia.getUrl() == null) {
            try {
              lastMedia.setUrl(completeUrl(paramValue, baseUrl, hostUrl));
            } catch (MalformedURLException e) {
              logger.debug("Bad media url: " + paramValue);
            }
          }
        } // do something for "autoplay" and "autoStart"?
      }
    }
    // others elements are ignored
    return aLink;
  }

  /** Appends "&lt;label&gt; &lt;text&gt;" to the item when the text is not empty. */
  private void appendLabelled(WebPageItem item, String labelKey, String text) {
    if (text != null && !text.equals("")) {
      item.appendContent(label.getString(labelKey).concat(" ").concat(text));
    }
  }

  /**
   * Adds to the item an "enclosure" link towards the media referenced by an element.
   *
   * @param urlAttribute name of the attribute holding the media URL
   * @param readMimeType whether the "type" attribute gives the MIME type of the media
   */
  private void addEnclosureLink(
      WebPage doc,
      WebPageItem item,
      Element element,
      String urlAttribute,
      boolean readMimeType,
      String baseUrl,
      String hostUrl,
      int[] linkCounter) {
    String mediaSrc = element.getAttribute(urlAttribute);
    try {
      URL anUrl = completeUrl(mediaSrc, baseUrl, hostUrl);
      SimpleDocument aMedia = new SimpleDocument();
      aMedia.setUrl(anUrl);
      if (readMimeType) {
        String mediaType = element.getAttribute("type");
        if (mediaType != null) {
          aMedia.setMimeType(mediaType);
        }
      }
      String mediaTitle = element.getAttribute("title");
      if (mediaTitle != null) {
        aMedia.setTitle(mediaTitle);
      }
      Link mediaLink = new Link();
      mediaLink.setLinkedDocument(aMedia);
      mediaLink.setLinkingDocument(doc);
      mediaLink.setRelation("enclosure");
      mediaLink.setNumber(linkCounter[0]++);
      item.addLink(mediaLink);
    } catch (MalformedURLException e) {
      // skip the media creation if bad url
      logger.debug(e.getMessage());
    }
  }