/** * Depth first search of a tree node for collecting web page item * * @param doc HTML document * @param item current web page item * @param node current HTML document node * @param url URL of the document to parse */ @SuppressWarnings("unchecked") protected void depthFirstSearch( WebPage doc, WebPageItem item, Link aLink, Node node, URL url, String baseUrl, String hostUrl, Map<String, Map<String, Object>> allStyleMap, int idIndex, int linkNumber) { boolean searchDepth = true; short nodeType = node.getNodeType(); if (nodeType == Node.ELEMENT_NODE) { String nodeName = node.getNodeName().toLowerCase(); // elements defining a new item if (nodeName.equals("p") || nodeName.equals("div") || nodeName.equals("td") || nodeName.equals("th") || nodeName.equals("li") || nodeName.equals("body") || nodeName.equals("form")) { item = new WebPageItem(); Map<String, Object> itemStyles = new HashMap<String, Object>(); Map<String, Object> propElemMap = allStyleMap.get(nodeName); if (propElemMap != null) { itemStyles.putAll(propElemMap); } String itemClass = ((Element) node).getAttribute("class"); if (!itemClass.equals("")) { Map<String, Object> propClassMap = allStyleMap.get("*." + itemClass); if (propClassMap != null) { itemStyles.putAll(propClassMap); } propClassMap = allStyleMap.get(nodeName + "." + itemClass); if (propClassMap != null) { itemStyles.putAll(propClassMap); } } String itemId = ((Element) node).getAttribute("id"); if (itemId.equals("")) { itemId = "tramper" + (idIndex++); } else { Map<String, Object> propIdMap = allStyleMap.get("*#" + itemId); if (propIdMap != null) { itemStyles.putAll(propIdMap); } propIdMap = allStyleMap.get(nodeName + "#" + itemId); if (propIdMap != null) { itemStyles.putAll(propIdMap); } } item.setId(itemId); String itemStyle = ((Element) node).getAttribute("style"); if (!itemStyle.equals("")) { SpeechCSSParser cssParser = new SpeechCSSParser(); // "{" are necessary for CSS parser to work properly: Map<String, Object> propStyleMap = cssParser.parseStyleDeclaration("{" + itemStyle + "}", url.toString()); if (propStyleMap != null) { itemStyles.putAll(propStyleMap); } } Float volume = (Float) itemStyles.get(SpeechCSSParser.VOICE_VOLUME); item.setVolume(volume); Float balance = (Float) itemStyles.get(SpeechCSSParser.VOICE_BALANCE); item.setBalance(balance); String speak = (String) itemStyles.get(SpeechCSSParser.SPEAK); item.setSpeak(speak); Float pauseBefore = (Float) itemStyles.get(SpeechCSSParser.PAUSE_BEFORE); item.setPauseBefore(pauseBefore); Float pauseAfter = (Float) itemStyles.get(SpeechCSSParser.PAUSE_AFTER); item.setPauseAfter(pauseAfter); Float restBefore = (Float) itemStyles.get(SpeechCSSParser.REST_BEFORE); item.setRestBefore(restBefore); Float restAfter = (Float) itemStyles.get(SpeechCSSParser.REST_AFTER); item.setRestAfter(restAfter); URL cueBefore = (URL) itemStyles.get(SpeechCSSParser.CUE_BEFORE); item.setCueBefore(cueBefore); URL cueAfter = (URL) itemStyles.get(SpeechCSSParser.CUE_AFTER); item.setCueAfter(cueAfter); Float rate = (Float) itemStyles.get(SpeechCSSParser.VOICE_RATE); item.setRate(rate); Float pitch = (Float) itemStyles.get(SpeechCSSParser.VOICE_PITCH); item.setPitch(pitch); Float pitchRange = (Float) itemStyles.get(SpeechCSSParser.VOICE_PITCH_RANGE); item.setPitchRange(pitchRange); List<VoiceDesc> voiceFamily = (List<VoiceDesc>) itemStyles.get(SpeechCSSParser.VOICE_FAMILY); item.setVoiceFamily(voiceFamily); doc.addItem(item); } // elements to be replaced by an alternative text else if (nodeName.equals("input")) { String inputType = ((Element) node).getAttribute("type"); if (inputType != null && !inputType.equals("") && !inputType.equalsIgnoreCase("hidden")) { inputType = inputType.toLowerCase(); try { String inputText = label.getString("javaspeaker." + inputType); // read the input value if neither an image nor a checkbox, radio and file input if (!inputType.equals("image") && !inputType.equals("checkbox") && !inputType.equals("radio") && !inputType.equals("file")) { String inputValue = ((Element) node).getAttribute("value"); if (inputValue != null && !inputValue.equals("")) { inputText = inputText.concat(" " + inputValue); } } item.appendContent(inputText); } catch (MissingResourceException e) { // bad type attribute on input } } } else if (nodeName.equals("select")) { String selectString = label.getString("javaspeaker.select"); item.appendContent(selectString); } else if (nodeName.equals("textarea")) { String textareaString = label.getString("javaspeaker.textarea"); item.appendContent(textareaString); } else if (nodeName.equals("img")) { String imgAlt = ((Element) node).getAttribute("alt"); if (imgAlt != null && !"".equals(imgAlt)) { String imgString = label.getString("javaspeaker.image"); imgAlt = imgString.concat(" ").concat(imgAlt); item.appendContent(imgAlt); } } else if (nodeName.equals("a")) { String linkString = label.getString("javaspeaker.link"); String link = linkString.concat(" "); String linkTitle = ((Element) node).getAttribute("title"); if (linkTitle != null && !linkTitle.equals("")) { link = link.concat(linkTitle); } item.appendContent(link); String linkId = ((Element) node).getAttribute("id"); if (linkId.equals("")) { linkId = "tramper" + (idIndex++); } String linkHref = ((Element) node).getAttribute("href"); SimpleDocument aDocument = new SimpleDocument(); URL linkUrl = null; try { linkUrl = completeUrl(linkHref, baseUrl, hostUrl); aDocument.setUrl(linkUrl); } catch (MalformedURLException e) { logger.debug("Bad A href: " + linkUrl); } aLink = new Link(); String linkRelation = ((Element) node).getAttribute("rel"); if (linkRelation != null && !linkRelation.equals("")) { if (linkRelation.equalsIgnoreCase("prev")) { aLink.setRelation("previous"); } else { aLink.setRelation(linkRelation); } } aDocument.setTitle(linkTitle); aLink.setId(linkId); aLink.setLinkedDocument(aDocument); aLink.setLinkingDocument(doc); aLink.setNumber(linkNumber++); item.addLink(aLink); } else if (nodeName.equals("area")) { String areaAlt = ((Element) node).getAttribute("alt"); if (areaAlt != null && !areaAlt.equals("")) { String areaString = label.getString("javaspeaker.imagearea"); areaAlt = areaString.concat(" ").concat(areaAlt); item.appendContent(areaAlt); } } else if (nodeName.equals("frame")) { String frameTitle = ((Element) node).getAttribute("title"); if (frameTitle != null && !frameTitle.equals("")) { String frameString = label.getString("javaspeaker.frame"); frameTitle = frameString.concat(" ").concat(frameTitle); item.appendContent(frameTitle); } } // don't want to parse content of script and style elements else if (nodeName.equals("script") || nodeName.equals("style")) { searchDepth = false; } // embedded media to play else if (nodeName.equals("embed")) { String mediaSrc = ((Element) node).getAttribute("src"); try { URL anUrl = completeUrl(mediaSrc, baseUrl, hostUrl); SimpleDocument aMedia = new SimpleDocument(); aMedia.setUrl(anUrl); String mediaTitle = ((Element) node).getAttribute("title"); if (mediaTitle != null) { aMedia.setTitle(mediaTitle); } Link mediaLink = new Link(); mediaLink.setLinkedDocument(aMedia); mediaLink.setLinkingDocument(doc); mediaLink.setRelation("enclosure"); mediaLink.setNumber(linkNumber++); item.addLink(mediaLink); } catch (MalformedURLException e) { // skip the media creation if bad url logger.debug(e.getMessage()); } } else if (nodeName.equals("bgsound")) { String mediaSrc = ((Element) node).getAttribute("src"); try { URL anUrl = completeUrl(mediaSrc, baseUrl, hostUrl); SimpleDocument aMedia = new SimpleDocument(); aMedia.setUrl(anUrl); String mediaTitle = ((Element) node).getAttribute("title"); if (mediaTitle != null) { aMedia.setTitle(mediaTitle); } Link mediaLink = new Link(); mediaLink.setLinkedDocument(aMedia); mediaLink.setLinkingDocument(doc); mediaLink.setRelation("enclosure"); mediaLink.setNumber(linkNumber++); item.addLink(mediaLink); } catch (MalformedURLException e) { // skip the media creation if bad url logger.debug(e.getMessage()); } } else if (nodeName.equals("object")) { String mediaSrc = ((Element) node).getAttribute("data"); try { URL anUrl = completeUrl(mediaSrc, baseUrl, hostUrl); SimpleDocument aMedia = new SimpleDocument(); aMedia.setUrl(anUrl); String mediaType = ((Element) node).getAttribute("type"); if (mediaType != null) { aMedia.setMimeType(mediaType); } String mediaTitle = ((Element) node).getAttribute("title"); if (mediaTitle != null) { aMedia.setTitle(mediaTitle); } Link mediaLink = new Link(); mediaLink.setLinkedDocument(aMedia); mediaLink.setLinkingDocument(doc); mediaLink.setRelation("enclosure"); mediaLink.setNumber(linkNumber++); item.addLink(mediaLink); } catch (MalformedURLException e) { // skip the media creation if bad url logger.debug(e.getMessage()); } } // parameters for objects elements else if (nodeName.equals("param")) { String paramName = ((Element) node).getAttribute("name"); String paramValue = ((Element) node).getAttribute("value"); List<Link> links = item.getLinks(); if (links.size() > 0) { Link lastLink = links.get(links.size() - 1); SimpleDocument lastMedia = lastLink.getLinkedDocument(); if (paramName.equalsIgnoreCase("src")) { URL mediaUrl = lastMedia.getUrl(); if (mediaUrl == null) { try { mediaUrl = completeUrl(paramValue, baseUrl, hostUrl); lastMedia.setUrl(mediaUrl); } catch (MalformedURLException e) { logger.debug("Bad media url: " + mediaUrl); } } } // do something for "autoplay" and "autoStart"? } } // others elements are ignored } else if (nodeType == Node.TEXT_NODE) { String nodeValue = node.getNodeValue(); if (nodeValue != null && !nodeValue.equals("")) { item.appendContent(nodeValue); } if (aLink != null) { SimpleDocument aDocument = aLink.getLinkedDocument(); String title = aDocument.getTitle(); aDocument.setTitle(title + nodeValue); } } if (searchDepth) { NodeList childNodes = node.getChildNodes(); for (int i = 0; i < childNodes.getLength(); i++) { Node aChild = childNodes.item(i); depthFirstSearch( doc, item, aLink, aChild, url, baseUrl, hostUrl, allStyleMap, idIndex, linkNumber); } } }