/** * Check if the digital object conforms to this Module's internal signature information. * * <p>HTML is one of the most ill-defined of any open formats, so checking a "signature" really * means using some heuristics. The only required tag is TITLE, but that could occur well into the * file. So we look for any of three strings -- taking into account case-independence and white * space -- within the first sigBytes bytes, and call that a signature check. * * @param file A File object for the object being parsed * @param stream An InputStream, positioned at its beginning, which is generated from the object * to be parsed * @param info A fresh RepInfo object which will be modified to reflect the results of the test */ public void checkSignatures(File file, InputStream stream, RepInfo info) throws IOException { info.setFormat(_format[0]); info.setMimeType(_mimeType[0]); info.setModule(this); char[][] sigtext = new char[3][]; sigtext[0] = "<!DOCTYPE HTML".toCharArray(); sigtext[1] = "<HTML".toCharArray(); sigtext[2] = "<TITLE".toCharArray(); int[] sigstate = {0, 0, 0}; JhoveBase jb = getBase(); int sigBytes = jb.getSigBytes(); int bytesRead = 0; boolean eof = false; DataInputStream dstream = new DataInputStream(stream); while (!eof && bytesRead < sigBytes) { try { int ch = readUnsignedByte(dstream, this); char chr = Character.toUpperCase((char) ch); ++bytesRead; if (Character.isWhitespace(chr)) { continue; // ignore all whitespace } for (int i = 0; i < 3; i++) { int ss = sigstate[i]; char[] st = sigtext[i]; if (chr == st[ss]) { ++sigstate[i]; if (sigstate[i] == st.length) { // One of the sig texts matches! info.setSigMatch(_name); return; } } else sigstate[i] = 0; } } catch (EOFException e) { eof = true; } } // If we fall through, there was no sig match info.setWellFormed(false); return; }
/** * Parse the content of a purported HTML stream digital object and store the results in RepInfo. * * @param stream An InputStream, positioned at its beginning, which is generated from the object * to be parsed. If multiple calls to <code>parse</code> are made on the basis of a nonzero * value being returned, a new InputStream must be provided each time. * @param info A fresh (on the first call) RepInfo object which will be modified to reflect the * results of the parsing If multiple calls to <code>parse</code> are made on the basis of a * nonzero value being returned, the same RepInfo object should be passed with each call. * @param parseIndex Must be 0 in first call to <code>parse</code>. If <code>parse</code> returns * a nonzero value, it must be called again with <code>parseIndex</code> equal to that return * value. */ public int parse(InputStream stream, RepInfo info, int parseIndex) throws IOException { if (parseIndex != 0) { // Coming in with parseIndex = 1 indicates that we've determined // this is XHTML; so we invoke the XML module to parse it. // If parseIndex is 100, this is the first invocation of the // XML module, so we call it with 0; otherwise we call it with // the value of parseIndex. if (isXmlAvailable()) { edu.harvard.hul.ois.jhove.module.XmlModule xmlMod = new edu.harvard.hul.ois.jhove.module.XmlModule(); if (parseIndex == 100) { parseIndex = 0; } xmlMod.setApp(_app); xmlMod.setBase(_je); xmlMod.setDefaultParams(_defaultParams); try { xmlMod.applyDefaultParams(); } catch (Exception e) { // really shouldn't happen } xmlMod.setXhtmlDoctype(_doctype); return xmlMod.parse(stream, info, parseIndex); } else { // The XML module shouldn't be missing from any installation, // but someone who really wanted to could remove it. In // that case, you deserve what you get. info.setMessage(new ErrorMessage("XML-HUL module required to validate XHTML documents")); info.setWellFormed(false); // Treat it as completely wrong return 0; } } else { /* parseIndex = 0, first call only */ _doctype = null; } // Test if textMD is to be generated if (_defaultParams != null) { Iterator iter = _defaultParams.iterator(); while (iter.hasNext()) { String param = (String) iter.next(); if (param.toLowerCase().equals("withtextmd=true")) { _withTextMD = true; } } } initParse(); info.setFormat(_format[0]); info.setMimeType(_mimeType[0]); info.setModule(this); if (_textMD == null || parseIndex == 0) { _textMD = new TextMDMetadata(); } /* We may have already done the checksums while converting a temporary file. */ Checksummer ckSummer = null; if (_je != null && _je.getChecksumFlag() && info.getChecksum().size() == 0) { ckSummer = new Checksummer(); _cstream = new ChecksumInputStream(stream, ckSummer); _dstream = getBufferedDataStream(_cstream, _je != null ? _je.getBufferSize() : 0); } else { _dstream = getBufferedDataStream(stream, _je != null ? _je.getBufferSize() : 0); } ParseHtml parser = null; HtmlMetadata metadata = null; HtmlCharStream cstream = null; try { cstream = new HtmlCharStream(_dstream, "ISO-8859-1"); parser = new ParseHtml(cstream); } catch (UnsupportedEncodingException e) { info.setMessage(new ErrorMessage("Internal error: " + e.getMessage())); info.setWellFormed(false); return 0; // shouldn't happen! } int type = 0; try { List elements = parser.HtmlDoc(); if (elements.isEmpty()) { // Consider an empty document bad info.setWellFormed(false); info.setMessage(new ErrorMessage("Document is empty")); return 0; } type = checkDoctype(elements); if (type < 0) { info.setWellFormed(false); info.setMessage(new ErrorMessage("DOCTYPE is not HTML")); return 0; } /* Check if there is at least one html, head, body or title tag. * A plain text document * might be interpreted as a single PCDATA, which is in some * ethereal sense well-formed HTML, but it's pointless to consider * it such. It might also use angle brackets as a text delimiter, * and that shouldn't count as HTML either. */ boolean hasElements = false; Iterator iter = elements.iterator(); while (iter.hasNext()) { Object o = iter.next(); if (o instanceof JHOpenTag) { String name = ((JHOpenTag) o).getName(); if ("html".equals(name) || "head".equals(name) || "body".equals(name) || "title".equals(name)) { hasElements = true; } break; } } if (!hasElements) { info.setMessage(new ErrorMessage("Document contains no html, head, body or title tags")); info.setWellFormed(false); return 0; } // CRLF from HtmlCharStream ... String lineEnd = cstream.getKindOfLineEnd(); if (lineEnd == null) { info.setMessage(new InfoMessage("Not able to determine type of end of line")); _textMD.setLinebreak(TextMDMetadata.NILL); } else if (lineEnd.equalsIgnoreCase("CR")) { _textMD.setLinebreak(TextMDMetadata.LINEBREAK_CR); } else if (lineEnd.equalsIgnoreCase("LF")) { _textMD.setLinebreak(TextMDMetadata.LINEBREAK_LF); } else if (lineEnd.equalsIgnoreCase("CRLF")) { _textMD.setLinebreak(TextMDMetadata.LINEBREAK_CRLF); } if (type == 0) { /* If we can't find a doctype, it still might be XHTML * if the elements start with an XML declaration and * the root element is "html" */ switch (seemsToBeXHTML(elements)) { case 0: // Not XML break; // fall through case 1: // XML but not HTML info.setMessage( new ErrorMessage( "Document has XML declaration but no DOCTYPE; " + "probably XML rather than HTML")); info.setWellFormed(false); return 0; case 2: // probably XHTML return 100; } info.setMessage( new ErrorMessage( "Unrecognized or missing DOCTYPE declaration; " + "validation continuing as HTML 3.2")); info.setValid(false); // But keep going } HtmlDocDesc docDesc = null; switch (type) { case HTML_3_2: default: docDesc = new Html3_2DocDesc(); _textMD.setMarkup_basis("HTML"); _textMD.setMarkup_basis_version("3.2"); break; case HTML_4_0_FRAMESET: docDesc = new Html4_0FrameDocDesc(); _textMD.setMarkup_basis("HTML"); _textMD.setMarkup_basis_version("4.0"); break; case HTML_4_0_TRANSITIONAL: docDesc = new Html4_0TransDocDesc(); _textMD.setMarkup_basis("HTML"); _textMD.setMarkup_basis_version("4.0"); break; case HTML_4_0_STRICT: docDesc = new Html4_0StrictDocDesc(); _textMD.setMarkup_basis("HTML"); _textMD.setMarkup_basis_version("4.0"); break; case HTML_4_01_FRAMESET: docDesc = new Html4_01FrameDocDesc(); _textMD.setMarkup_basis("HTML"); _textMD.setMarkup_basis_version("4.01"); break; case HTML_4_01_TRANSITIONAL: docDesc = new Html4_01TransDocDesc(); _textMD.setMarkup_basis("HTML"); _textMD.setMarkup_basis_version("4.01"); break; case HTML_4_01_STRICT: docDesc = new Html4_01StrictDocDesc(); _textMD.setMarkup_basis("HTML"); _textMD.setMarkup_basis_version("4.01"); break; case XHTML_1_0_STRICT: case XHTML_1_0_TRANSITIONAL: case XHTML_1_0_FRAMESET: case XHTML_1_1: // Force a second call to parse as XML. 100 is a // magic code for the first XML call. return 100; } _textMD.setMarkup_language(_doctype); if (docDesc == null) { info.setMessage( new InfoMessage( "Code for appropriate HTML version not available yet:" + "substituting HTML 3.2")); docDesc = new Html3_2DocDesc(); } docDesc.validate(elements, info); metadata = docDesc.getMetadata(); // Try to get the charset from the meta Content if (metadata.getCharset() != null) { _textMD.setCharset(metadata.getCharset()); } else { _textMD.setCharset(TextMDMetadata.CHARSET_ISO8859_1); } String textMDEncoding = _textMD.getCharset(); if (textMDEncoding.indexOf("UTF") != -1) { _textMD.setByte_order( _bigEndian ? TextMDMetadata.BYTE_ORDER_BIG : TextMDMetadata.BYTE_ORDER_LITTLE); _textMD.setByte_size("8"); _textMD.setCharacter_size("variable"); } else { _textMD.setByte_order( _bigEndian ? TextMDMetadata.BYTE_ORDER_BIG : TextMDMetadata.BYTE_ORDER_LITTLE); _textMD.setByte_size("8"); _textMD.setCharacter_size("1"); } } catch (ParseException e) { Token t = e.currentToken; info.setMessage( new ErrorMessage("Parse error", "Line = " + t.beginLine + ", column = " + t.beginColumn)); info.setWellFormed(false); } catch (TokenMgrError f) { info.setMessage(new ErrorMessage("TokenMgrError: " + f.getLocalizedMessage())); info.setWellFormed(false); } if (info.getWellFormed() == RepInfo.FALSE) { return 0; } if (type != 0) { if (profileNames[type] != null) { info.setProfile(profileNames[type]); } info.setVersion(versionNames[type]); } if (metadata != null) { Property property = metadata.toProperty(_withTextMD ? _textMD : null); if (property != null) { info.setProperty(property); } } if (ckSummer != null) { info.setSize(_cstream.getNBytes()); info.setChecksum(new Checksum(ckSummer.getCRC32(), ChecksumType.CRC32)); String value = ckSummer.getMD5(); if (value != null) { info.setChecksum(new Checksum(value, ChecksumType.MD5)); } if ((value = ckSummer.getSHA1()) != null) { info.setChecksum(new Checksum(value, ChecksumType.SHA1)); } } return 0; }
/** * Parses the content of a purported WAVE digital object and stores the results in RepInfo. * * @param stream An InputStream, positioned at its beginning, which is generated from the object * to be parsed * @param info A fresh RepInfo object which will be modified to reflect the results of the parsing * @param parseIndex Must be 0 in first call to <code>parse</code>. If <code>parse</code> returns * a nonzero value, it must be called again with <code>parseIndex</code> equal to that return * value. */ public int parse(InputStream stream, RepInfo info, int parseIndex) throws IOException { initParse(); info.setFormat(_format[0]); info.setMimeType(_mimeType[0]); info.setModule(this); _aesMetadata.setPrimaryIdentifier(info.getUri()); if (info.getURLFlag()) { _aesMetadata.setOtherPrimaryIdentifierType("URI"); } else { _aesMetadata.setPrimaryIdentifierType(AESAudioMetadata.FILE_NAME); } /* We may have already done the checksums while converting a temporary file. */ _ckSummer = null; if (_je != null && _je.getChecksumFlag() && info.getChecksum().size() == 0) { _ckSummer = new Checksummer(); _cstream = new ChecksumInputStream(stream, _ckSummer); _dstream = getBufferedDataStream(_cstream, _je != null ? _je.getBufferSize() : 0); } else { _dstream = getBufferedDataStream(stream, _je != null ? _je.getBufferSize() : 0); } try { // Check the start of the file for the right opening bytes for (int i = 0; i < 4; i++) { int ch = readUnsignedByte(_dstream, this); if (ch != sigByte[i]) { info.setMessage(new ErrorMessage("Document does not start with RIFF chunk", 0)); info.setWellFormed(false); return 0; } } /* If we got this far, take note that the signature is OK. */ info.setSigMatch(_name); // Get the length of the Form chunk. This includes all // the subsequent chunks in the file, but excludes the // header ("FORM" and the length itself). bytesRemaining = readUnsignedInt(_dstream); // Read the file type. String typ = read4Chars(_dstream); bytesRemaining -= 4; if (!"WAVE".equals(typ)) { info.setMessage(new ErrorMessage("File type in RIFF header is not WAVE", _nByte)); info.setWellFormed(false); return 0; } while (bytesRemaining > 0) { if (!readChunk(info)) { break; } } } catch (EOFException e) { info.setWellFormed(false); info.setMessage(new ErrorMessage("Unexpected end of file", _nByte)); return 0; } // Set duration from number of samples and rate. if (numSamples > 0) { // _aesMetadata.setDuration((double) numSamples / sampleRate); _aesMetadata.setDuration(numSamples); } // Add note and label properties, if there's anything // to report. if (!_labels.isEmpty()) { _propList.add(new Property("Labels", PropertyType.PROPERTY, PropertyArity.LIST, _labels)); } if (!_labeledText.isEmpty()) { _propList.add( new Property("LabeledText", PropertyType.PROPERTY, PropertyArity.LIST, _labeledText)); } if (!_notes.isEmpty()) { _propList.add(new Property("Notes", PropertyType.PROPERTY, PropertyArity.LIST, _notes)); } if (!_samples.isEmpty()) { _propList.add(new Property("Samples", PropertyType.PROPERTY, PropertyArity.LIST, _samples)); } if (_exifInfo != null) { _propList.add(_exifInfo.buildProperty()); } if (!formatChunkSeen) { info.setMessage(new ErrorMessage("No Format Chunk")); info.setWellFormed(false); return 0; } /* This file looks OK. */ if (_ckSummer != null) { /* We may not have actually hit the end of file. If we're calculating * checksums on the fly, we have to read and discard whatever is * left, so it will get checksummed. */ for (; ; ) { try { int n = skipBytes(_dstream, 2048, this); if (n == 0) { break; } } catch (Exception e) { break; } } info.setSize(_cstream.getNBytes()); info.setChecksum(new Checksum(_ckSummer.getCRC32(), ChecksumType.CRC32)); String value = _ckSummer.getMD5(); if (value != null) { info.setChecksum(new Checksum(value, ChecksumType.MD5)); } if ((value = _ckSummer.getSHA1()) != null) { info.setChecksum(new Checksum(value, ChecksumType.SHA1)); } } info.setProperty(_metadata); // Indicate satisfied profiles. if (flagPCMWaveFormat) { info.setProfile("PCMWAVEFORMAT"); } if (flagWaveFormatEx) { info.setProfile("WAVEFORMATEX"); } if (flagWaveFormatExtensible) { info.setProfile("WAVEFORMATEXTENSIBLE"); } if (flagBroadcastWave) { // Need to do some additional checks. if (!broadcastExtChunkSeen) { flagBroadcastWave = false; } if (compressionCode == FormatChunk.WAVE_FORMAT_MPEG) { if (!broadcastExtChunkSeen || !factChunkSeen) { flagBroadcastWave = false; } } if (flagBroadcastWave) { String prof = null; switch (broadcastVersion) { case 0: prof = "Broadcast Wave Version 0"; break; case 1: prof = "Broadcast Wave Version 1"; break; // Other versions are unknown at this time } if (prof != null) { info.setProfile(prof); } } } return 0; }