/* See if this document, even if it lacks a doctype, is most likely * XHTML. The test is that the document starts with an XML declaration * and has "html" for its first tag. * * Returns: * 0 if there's no XML declaration * 1 if there's an XML declaration but no html tag; in this * case it's probably some other kind of XML * 2 if there's an XML declaration and an html tag * */ protected int seemsToBeXHTML(List elements) { JHElement elem; try { elem = (JHElement) elements.get(0); if (!(elem instanceof JHXmlDecl)) { return 0; } Iterator iter = elements.iterator(); while (iter.hasNext()) { elem = (JHElement) iter.next(); if (elem instanceof JHOpenTag) { JHOpenTag tag = (JHOpenTag) elem; return ("html".equals(tag.getName()) ? 2 : 1); } } } catch (Exception e) { return 0; // document must be really empty } return 1; }
/** * Parse the content of a purported HTML stream digital object and store the results in RepInfo. * * @param stream An InputStream, positioned at its beginning, which is generated from the object * to be parsed. If multiple calls to <code>parse</code> are made on the basis of a nonzero * value being returned, a new InputStream must be provided each time. * @param info A fresh (on the first call) RepInfo object which will be modified to reflect the * results of the parsing If multiple calls to <code>parse</code> are made on the basis of a * nonzero value being returned, the same RepInfo object should be passed with each call. * @param parseIndex Must be 0 in first call to <code>parse</code>. If <code>parse</code> returns * a nonzero value, it must be called again with <code>parseIndex</code> equal to that return * value. */ public int parse(InputStream stream, RepInfo info, int parseIndex) throws IOException { if (parseIndex != 0) { // Coming in with parseIndex = 1 indicates that we've determined // this is XHTML; so we invoke the XML module to parse it. // If parseIndex is 100, this is the first invocation of the // XML module, so we call it with 0; otherwise we call it with // the value of parseIndex. if (isXmlAvailable()) { edu.harvard.hul.ois.jhove.module.XmlModule xmlMod = new edu.harvard.hul.ois.jhove.module.XmlModule(); if (parseIndex == 100) { parseIndex = 0; } xmlMod.setApp(_app); xmlMod.setBase(_je); xmlMod.setDefaultParams(_defaultParams); try { xmlMod.applyDefaultParams(); } catch (Exception e) { // really shouldn't happen } xmlMod.setXhtmlDoctype(_doctype); return xmlMod.parse(stream, info, parseIndex); } else { // The XML module shouldn't be missing from any installation, // but someone who really wanted to could remove it. In // that case, you deserve what you get. info.setMessage(new ErrorMessage("XML-HUL module required to validate XHTML documents")); info.setWellFormed(false); // Treat it as completely wrong return 0; } } else { /* parseIndex = 0, first call only */ _doctype = null; } // Test if textMD is to be generated if (_defaultParams != null) { Iterator iter = _defaultParams.iterator(); while (iter.hasNext()) { String param = (String) iter.next(); if (param.toLowerCase().equals("withtextmd=true")) { _withTextMD = true; } } } initParse(); info.setFormat(_format[0]); info.setMimeType(_mimeType[0]); info.setModule(this); if (_textMD == null || parseIndex == 0) { _textMD = new TextMDMetadata(); } /* We may have already done the checksums while converting a temporary file. */ Checksummer ckSummer = null; if (_je != null && _je.getChecksumFlag() && info.getChecksum().size() == 0) { ckSummer = new Checksummer(); _cstream = new ChecksumInputStream(stream, ckSummer); _dstream = getBufferedDataStream(_cstream, _je != null ? _je.getBufferSize() : 0); } else { _dstream = getBufferedDataStream(stream, _je != null ? _je.getBufferSize() : 0); } ParseHtml parser = null; HtmlMetadata metadata = null; HtmlCharStream cstream = null; try { cstream = new HtmlCharStream(_dstream, "ISO-8859-1"); parser = new ParseHtml(cstream); } catch (UnsupportedEncodingException e) { info.setMessage(new ErrorMessage("Internal error: " + e.getMessage())); info.setWellFormed(false); return 0; // shouldn't happen! } int type = 0; try { List elements = parser.HtmlDoc(); if (elements.isEmpty()) { // Consider an empty document bad info.setWellFormed(false); info.setMessage(new ErrorMessage("Document is empty")); return 0; } type = checkDoctype(elements); if (type < 0) { info.setWellFormed(false); info.setMessage(new ErrorMessage("DOCTYPE is not HTML")); return 0; } /* Check if there is at least one html, head, body or title tag. * A plain text document * might be interpreted as a single PCDATA, which is in some * ethereal sense well-formed HTML, but it's pointless to consider * it such. It might also use angle brackets as a text delimiter, * and that shouldn't count as HTML either. */ boolean hasElements = false; Iterator iter = elements.iterator(); while (iter.hasNext()) { Object o = iter.next(); if (o instanceof JHOpenTag) { String name = ((JHOpenTag) o).getName(); if ("html".equals(name) || "head".equals(name) || "body".equals(name) || "title".equals(name)) { hasElements = true; } break; } } if (!hasElements) { info.setMessage(new ErrorMessage("Document contains no html, head, body or title tags")); info.setWellFormed(false); return 0; } // CRLF from HtmlCharStream ... String lineEnd = cstream.getKindOfLineEnd(); if (lineEnd == null) { info.setMessage(new InfoMessage("Not able to determine type of end of line")); _textMD.setLinebreak(TextMDMetadata.NILL); } else if (lineEnd.equalsIgnoreCase("CR")) { _textMD.setLinebreak(TextMDMetadata.LINEBREAK_CR); } else if (lineEnd.equalsIgnoreCase("LF")) { _textMD.setLinebreak(TextMDMetadata.LINEBREAK_LF); } else if (lineEnd.equalsIgnoreCase("CRLF")) { _textMD.setLinebreak(TextMDMetadata.LINEBREAK_CRLF); } if (type == 0) { /* If we can't find a doctype, it still might be XHTML * if the elements start with an XML declaration and * the root element is "html" */ switch (seemsToBeXHTML(elements)) { case 0: // Not XML break; // fall through case 1: // XML but not HTML info.setMessage( new ErrorMessage( "Document has XML declaration but no DOCTYPE; " + "probably XML rather than HTML")); info.setWellFormed(false); return 0; case 2: // probably XHTML return 100; } info.setMessage( new ErrorMessage( "Unrecognized or missing DOCTYPE declaration; " + "validation continuing as HTML 3.2")); info.setValid(false); // But keep going } HtmlDocDesc docDesc = null; switch (type) { case HTML_3_2: default: docDesc = new Html3_2DocDesc(); _textMD.setMarkup_basis("HTML"); _textMD.setMarkup_basis_version("3.2"); break; case HTML_4_0_FRAMESET: docDesc = new Html4_0FrameDocDesc(); _textMD.setMarkup_basis("HTML"); _textMD.setMarkup_basis_version("4.0"); break; case HTML_4_0_TRANSITIONAL: docDesc = new Html4_0TransDocDesc(); _textMD.setMarkup_basis("HTML"); _textMD.setMarkup_basis_version("4.0"); break; case HTML_4_0_STRICT: docDesc = new Html4_0StrictDocDesc(); _textMD.setMarkup_basis("HTML"); _textMD.setMarkup_basis_version("4.0"); break; case HTML_4_01_FRAMESET: docDesc = new Html4_01FrameDocDesc(); _textMD.setMarkup_basis("HTML"); _textMD.setMarkup_basis_version("4.01"); break; case HTML_4_01_TRANSITIONAL: docDesc = new Html4_01TransDocDesc(); _textMD.setMarkup_basis("HTML"); _textMD.setMarkup_basis_version("4.01"); break; case HTML_4_01_STRICT: docDesc = new Html4_01StrictDocDesc(); _textMD.setMarkup_basis("HTML"); _textMD.setMarkup_basis_version("4.01"); break; case XHTML_1_0_STRICT: case XHTML_1_0_TRANSITIONAL: case XHTML_1_0_FRAMESET: case XHTML_1_1: // Force a second call to parse as XML. 100 is a // magic code for the first XML call. return 100; } _textMD.setMarkup_language(_doctype); if (docDesc == null) { info.setMessage( new InfoMessage( "Code for appropriate HTML version not available yet:" + "substituting HTML 3.2")); docDesc = new Html3_2DocDesc(); } docDesc.validate(elements, info); metadata = docDesc.getMetadata(); // Try to get the charset from the meta Content if (metadata.getCharset() != null) { _textMD.setCharset(metadata.getCharset()); } else { _textMD.setCharset(TextMDMetadata.CHARSET_ISO8859_1); } String textMDEncoding = _textMD.getCharset(); if (textMDEncoding.indexOf("UTF") != -1) { _textMD.setByte_order( _bigEndian ? TextMDMetadata.BYTE_ORDER_BIG : TextMDMetadata.BYTE_ORDER_LITTLE); _textMD.setByte_size("8"); _textMD.setCharacter_size("variable"); } else { _textMD.setByte_order( _bigEndian ? TextMDMetadata.BYTE_ORDER_BIG : TextMDMetadata.BYTE_ORDER_LITTLE); _textMD.setByte_size("8"); _textMD.setCharacter_size("1"); } } catch (ParseException e) { Token t = e.currentToken; info.setMessage( new ErrorMessage("Parse error", "Line = " + t.beginLine + ", column = " + t.beginColumn)); info.setWellFormed(false); } catch (TokenMgrError f) { info.setMessage(new ErrorMessage("TokenMgrError: " + f.getLocalizedMessage())); info.setWellFormed(false); } if (info.getWellFormed() == RepInfo.FALSE) { return 0; } if (type != 0) { if (profileNames[type] != null) { info.setProfile(profileNames[type]); } info.setVersion(versionNames[type]); } if (metadata != null) { Property property = metadata.toProperty(_withTextMD ? _textMD : null); if (property != null) { info.setProperty(property); } } if (ckSummer != null) { info.setSize(_cstream.getNBytes()); info.setChecksum(new Checksum(ckSummer.getCRC32(), ChecksumType.CRC32)); String value = ckSummer.getMD5(); if (value != null) { info.setChecksum(new Checksum(value, ChecksumType.MD5)); } if ((value = ckSummer.getSHA1()) != null) { info.setChecksum(new Checksum(value, ChecksumType.SHA1)); } } return 0; }