/** @see org.tramper.parser.Parser#parse(java.io.InputStream, URL) */ public SimpleDocument parse(InputStream inStream, URL url) throws ParsingException { Map<String, Map<String, Object>> allStyleMap = new HashMap<String, Map<String, Object>>(defaultStyleMap); Document docRoot = this.makeDocument(inStream); WebPage doc = new WebPage(); doc.setMimeType("text/html"); doc.setCharset("UTF-8"); Element root = (Element) docRoot.getElementsByTagName("html").item(0); if (root == null) { logger.error("no html root"); throw new ParsingException("no html root"); } else { String lang = root.getAttribute("lang"); if (lang != null) { doc.parseLanguage(lang); } else { lang = root.getAttribute("xml:lang"); if (lang != null) { doc.parseLanguage(lang); } else { doc.setLanguage(Locale.getDefault()); } } } String hostUrl = null, baseUrl = null; if (url != null) { // first we determine the base url and the host part hostUrl = url.getProtocol() + "://" + url.getHost(); int port = url.getPort(); if (port != -1) { hostUrl += ":" + port; } String path = url.getPath(); int lastSlashIndex = path.lastIndexOf("/"); if (lastSlashIndex != -1) { path = path.substring(0, lastSlashIndex + 1); } else { path = "/"; } baseUrl = hostUrl + path; } NodeList titleList = docRoot.getElementsByTagName("title"); if (titleList.getLength() > 0) { Node title = titleList.item(0); Node titleText = title.getFirstChild(); if (titleText != null) { doc.setTitle(titleText.getNodeValue()); } } NodeList metaList = docRoot.getElementsByTagName("meta"); for (int i = 0; i < metaList.getLength(); i++) { Element meta = (Element) metaList.item(i); String metaName = meta.getAttribute("name"); String metaHttpEquiv = meta.getAttribute("http-equiv"); String metaContent = meta.getAttribute("content"); if (metaName != null && !metaName.equals("")) { if (metaName.equalsIgnoreCase("description")) doc.setDescription(metaContent); else if (metaName.equalsIgnoreCase("author")) doc.setAuthor(metaContent); else if (metaName.equalsIgnoreCase("category")) doc.setCategory(metaContent); else if (metaName.equalsIgnoreCase("copyright")) doc.setCopyright(metaContent); } else if (metaHttpEquiv != null && !metaHttpEquiv.equals("")) { if (metaHttpEquiv.equalsIgnoreCase("content-type")) { if (metaContent != null) { // manage the charset if present int semiColumnIndex = metaContent.indexOf(";"); if (semiColumnIndex != -1) { int equalIndex = metaContent.indexOf("="); if (equalIndex != -1) { String metaCharset = metaContent.substring(equalIndex + 1); doc.setCharset(metaCharset); } metaContent = metaContent.substring(0, semiColumnIndex); } doc.setMimeType(metaContent); } } else if (metaHttpEquiv.equalsIgnoreCase("content-language")) { if (metaContent != null) doc.parseLanguage(metaContent); } } } SpeechCSSParser cssParser = new SpeechCSSParser(); NodeList metaLink = docRoot.getElementsByTagName("link"); for (int i = 0; i < metaLink.getLength(); i++) { Element link = (Element) metaLink.item(i); String linkHref = link.getAttribute("href"); try { URL anUrl = completeUrl(linkHref, baseUrl, hostUrl); if (anUrl != null) { String linkTitle = link.getAttribute("title"); String linkRel = link.getAttribute("rel"); String linkType = link.getAttribute("type"); if (linkRel.equalsIgnoreCase("icon") || linkRel.equalsIgnoreCase("shortcut icon")) { try { Image img = ImageIO.read(anUrl); doc.setIcon(img); } catch (Exception e) { logger.debug("unreadable icon: " + anUrl); } } else if (linkRel.equalsIgnoreCase("stylesheet")) { if ("text/css".equals(linkType)) { String media = link.getAttribute("media").toLowerCase(); if (media.equals("") || media.equals("speech") || media.equals("aural")) { // load a CSS style sheet Map<String, Map<String, Object>> styleMap = cssParser.parse(anUrl); // don't overwrite a default style if not overloaded for an existing element Iterator<Entry<String, Map<String, Object>>> entryIt = styleMap.entrySet().iterator(); while (entryIt.hasNext()) { Map.Entry<String, Map<String, Object>> entry = entryIt.next(); String key = entry.getKey(); Map<String, Object> value = entry.getValue(); if (allStyleMap.containsKey(key)) { Map<String, Object> elementStyles = allStyleMap.get(key); elementStyles.putAll(value); } else { allStyleMap.put(key, value); } } } } } else { SimpleDocument aDocument = new SimpleDocument(); aDocument.setUrl(anUrl); aDocument.setTitle(linkTitle); aDocument.setMimeType(linkType); Link aLink = new Link(); aLink.setLinkedDocument(aDocument); aLink.setLinkingDocument(doc); if (linkRel.equalsIgnoreCase("prev")) { aLink.setRelation("previous"); } else { aLink.setRelation(linkRel); } doc.addLink(aLink); } } } catch (MalformedURLException e) { logger.debug(e.getMessage()); } } NodeList styles = docRoot.getElementsByTagName("style"); for (int i = 0; i < styles.getLength(); i++) { Element style = (Element) styles.item(i); String text = style.getTextContent(); Map<String, Map<String, Object>> styleMap = cssParser.parse(text, url.toString()); // don't overwrite a default style if not overloaded for an existing element Iterator<Entry<String, Map<String, Object>>> entryIt = styleMap.entrySet().iterator(); while (entryIt.hasNext()) { Map.Entry<String, Map<String, Object>> entry = entryIt.next(); String key = entry.getKey(); Map<String, Object> value = entry.getValue(); if (allStyleMap.containsKey(key)) { Map<String, Object> elementStyles = allStyleMap.get(key); elementStyles.putAll(value); } else { allStyleMap.put(key, value); } } } Node body = docRoot.getElementsByTagName("body").item(0); if (body == null) { throw new ParsingException("no body element in HTML document"); } int linkNumber = 1; try { label = ResourceBundle.getBundle("label", doc.getLanguage()); } catch (Exception e) { // either the document language is null, or there is no properties file for it label = ResourceBundle.getBundle("label"); } int idIndex = 1; this.depthFirstSearch( doc, null, null, body, url, baseUrl, hostUrl, allStyleMap, idIndex, linkNumber); return doc; }