/**
   * Parse the content of a purported HTML stream digital object and store the results in RepInfo.
   *
   * @param stream An InputStream, positioned at its beginning, which is generated from the object
   *     to be parsed. If multiple calls to <code>parse</code> are made on the basis of a nonzero
   *     value being returned, a new InputStream must be provided each time.
   * @param info A fresh (on the first call) RepInfo object which will be modified to reflect the
   *     results of the parsing If multiple calls to <code>parse</code> are made on the basis of a
   *     nonzero value being returned, the same RepInfo object should be passed with each call.
   * @param parseIndex Must be 0 in first call to <code>parse</code>. If <code>parse</code> returns
   *     a nonzero value, it must be called again with <code>parseIndex</code> equal to that return
   *     value.
   */
  public int parse(InputStream stream, RepInfo info, int parseIndex) throws IOException {
    if (parseIndex != 0) {
      // Coming in with parseIndex = 1 indicates that we've determined
      // this is XHTML; so we invoke the XML module to parse it.
      // If parseIndex is 100, this is the first invocation of the
      // XML module, so we call it with 0; otherwise we call it with
      // the value of parseIndex.
      if (isXmlAvailable()) {
        edu.harvard.hul.ois.jhove.module.XmlModule xmlMod =
            new edu.harvard.hul.ois.jhove.module.XmlModule();
        if (parseIndex == 100) {
          parseIndex = 0;
        }
        xmlMod.setApp(_app);
        xmlMod.setBase(_je);
        xmlMod.setDefaultParams(_defaultParams);
        try {
          xmlMod.applyDefaultParams();
        } catch (Exception e) {
          // really shouldn't happen
        }
        xmlMod.setXhtmlDoctype(_doctype);
        return xmlMod.parse(stream, info, parseIndex);
      } else {
        // The XML module shouldn't be missing from any installation,
        // but someone who really wanted to could remove it.  In
        // that case, you deserve what you get.
        info.setMessage(new ErrorMessage("XML-HUL module required to validate XHTML documents"));
        info.setWellFormed(false); // Treat it as completely wrong
        return 0;
      }
    } else {
      /* parseIndex = 0, first call only */
      _doctype = null;
    }
    // Test if textMD is to be generated
    if (_defaultParams != null) {
      Iterator iter = _defaultParams.iterator();
      while (iter.hasNext()) {
        String param = (String) iter.next();
        if (param.toLowerCase().equals("withtextmd=true")) {
          _withTextMD = true;
        }
      }
    }

    initParse();
    info.setFormat(_format[0]);
    info.setMimeType(_mimeType[0]);
    info.setModule(this);

    if (_textMD == null || parseIndex == 0) {
      _textMD = new TextMDMetadata();
    }
    /* We may have already done the checksums while converting a
    temporary file. */
    Checksummer ckSummer = null;
    if (_je != null && _je.getChecksumFlag() && info.getChecksum().size() == 0) {
      ckSummer = new Checksummer();
      _cstream = new ChecksumInputStream(stream, ckSummer);
      _dstream = getBufferedDataStream(_cstream, _je != null ? _je.getBufferSize() : 0);
    } else {
      _dstream = getBufferedDataStream(stream, _je != null ? _je.getBufferSize() : 0);
    }

    ParseHtml parser = null;
    HtmlMetadata metadata = null;
    HtmlCharStream cstream = null;
    try {
      cstream = new HtmlCharStream(_dstream, "ISO-8859-1");
      parser = new ParseHtml(cstream);
    } catch (UnsupportedEncodingException e) {
      info.setMessage(new ErrorMessage("Internal error: " + e.getMessage()));
      info.setWellFormed(false);
      return 0; // shouldn't happen!
    }
    int type = 0;
    try {
      List elements = parser.HtmlDoc();
      if (elements.isEmpty()) {
        // Consider an empty document bad
        info.setWellFormed(false);
        info.setMessage(new ErrorMessage("Document is empty"));
        return 0;
      }
      type = checkDoctype(elements);
      if (type < 0) {
        info.setWellFormed(false);
        info.setMessage(new ErrorMessage("DOCTYPE is not HTML"));
        return 0;
      }
      /* Check if there is at least one html, head, body or title tag.
       * A plain text document
       * might be interpreted as a single PCDATA, which is in some
       * ethereal sense well-formed HTML, but it's pointless to consider
       * it such.  It might also use angle brackets as a text delimiter,
       * and that shouldn't count as HTML either. */
      boolean hasElements = false;
      Iterator iter = elements.iterator();
      while (iter.hasNext()) {
        Object o = iter.next();
        if (o instanceof JHOpenTag) {
          String name = ((JHOpenTag) o).getName();
          if ("html".equals(name)
              || "head".equals(name)
              || "body".equals(name)
              || "title".equals(name)) {
            hasElements = true;
          }
          break;
        }
      }
      if (!hasElements) {
        info.setMessage(new ErrorMessage("Document contains no html, head, body or title tags"));
        info.setWellFormed(false);
        return 0;
      }

      // CRLF from HtmlCharStream ...
      String lineEnd = cstream.getKindOfLineEnd();
      if (lineEnd == null) {
        info.setMessage(new InfoMessage("Not able to determine type of end of line"));
        _textMD.setLinebreak(TextMDMetadata.NILL);
      } else if (lineEnd.equalsIgnoreCase("CR")) {
        _textMD.setLinebreak(TextMDMetadata.LINEBREAK_CR);
      } else if (lineEnd.equalsIgnoreCase("LF")) {
        _textMD.setLinebreak(TextMDMetadata.LINEBREAK_LF);
      } else if (lineEnd.equalsIgnoreCase("CRLF")) {
        _textMD.setLinebreak(TextMDMetadata.LINEBREAK_CRLF);
      }

      if (type == 0) {
        /* If we can't find a doctype, it still might be XHTML
         * if the elements start with an XML declaration and
         * the root element is "html" */
        switch (seemsToBeXHTML(elements)) {
          case 0: // Not XML
            break; // fall through
          case 1: // XML but not HTML
            info.setMessage(
                new ErrorMessage(
                    "Document has XML declaration but no DOCTYPE; "
                        + "probably XML rather than HTML"));
            info.setWellFormed(false);
            return 0;
          case 2: // probably XHTML
            return 100;
        }
        info.setMessage(
            new ErrorMessage(
                "Unrecognized or missing DOCTYPE declaration; "
                    + "validation continuing as HTML 3.2"));
        info.setValid(false);
        // But keep going
      }

      HtmlDocDesc docDesc = null;
      switch (type) {
        case HTML_3_2:
        default:
          docDesc = new Html3_2DocDesc();
          _textMD.setMarkup_basis("HTML");
          _textMD.setMarkup_basis_version("3.2");
          break;

        case HTML_4_0_FRAMESET:
          docDesc = new Html4_0FrameDocDesc();
          _textMD.setMarkup_basis("HTML");
          _textMD.setMarkup_basis_version("4.0");
          break;
        case HTML_4_0_TRANSITIONAL:
          docDesc = new Html4_0TransDocDesc();
          _textMD.setMarkup_basis("HTML");
          _textMD.setMarkup_basis_version("4.0");
          break;
        case HTML_4_0_STRICT:
          docDesc = new Html4_0StrictDocDesc();
          _textMD.setMarkup_basis("HTML");
          _textMD.setMarkup_basis_version("4.0");
          break;
        case HTML_4_01_FRAMESET:
          docDesc = new Html4_01FrameDocDesc();
          _textMD.setMarkup_basis("HTML");
          _textMD.setMarkup_basis_version("4.01");
          break;
        case HTML_4_01_TRANSITIONAL:
          docDesc = new Html4_01TransDocDesc();
          _textMD.setMarkup_basis("HTML");
          _textMD.setMarkup_basis_version("4.01");
          break;
        case HTML_4_01_STRICT:
          docDesc = new Html4_01StrictDocDesc();
          _textMD.setMarkup_basis("HTML");
          _textMD.setMarkup_basis_version("4.01");
          break;
        case XHTML_1_0_STRICT:
        case XHTML_1_0_TRANSITIONAL:
        case XHTML_1_0_FRAMESET:
        case XHTML_1_1:
          // Force a second call to parse as XML. 100 is a
          // magic code for the first XML call.
          return 100;
      }
      _textMD.setMarkup_language(_doctype);
      if (docDesc == null) {
        info.setMessage(
            new InfoMessage(
                "Code for appropriate HTML version not available yet:" + "substituting HTML 3.2"));
        docDesc = new Html3_2DocDesc();
      }
      docDesc.validate(elements, info);
      metadata = docDesc.getMetadata();

      // Try to get the charset from the meta Content
      if (metadata.getCharset() != null) {
        _textMD.setCharset(metadata.getCharset());
      } else {
        _textMD.setCharset(TextMDMetadata.CHARSET_ISO8859_1);
      }
      String textMDEncoding = _textMD.getCharset();
      if (textMDEncoding.indexOf("UTF") != -1) {
        _textMD.setByte_order(
            _bigEndian ? TextMDMetadata.BYTE_ORDER_BIG : TextMDMetadata.BYTE_ORDER_LITTLE);
        _textMD.setByte_size("8");
        _textMD.setCharacter_size("variable");
      } else {
        _textMD.setByte_order(
            _bigEndian ? TextMDMetadata.BYTE_ORDER_BIG : TextMDMetadata.BYTE_ORDER_LITTLE);
        _textMD.setByte_size("8");
        _textMD.setCharacter_size("1");
      }
    } catch (ParseException e) {
      Token t = e.currentToken;
      info.setMessage(
          new ErrorMessage("Parse error", "Line = " + t.beginLine + ", column = " + t.beginColumn));
      info.setWellFormed(false);
    } catch (TokenMgrError f) {
      info.setMessage(new ErrorMessage("TokenMgrError: " + f.getLocalizedMessage()));
      info.setWellFormed(false);
    }

    if (info.getWellFormed() == RepInfo.FALSE) {
      return 0;
    }

    if (type != 0) {
      if (profileNames[type] != null) {
        info.setProfile(profileNames[type]);
      }
      info.setVersion(versionNames[type]);
    }

    if (metadata != null) {
      Property property = metadata.toProperty(_withTextMD ? _textMD : null);
      if (property != null) {
        info.setProperty(property);
      }
    }

    if (ckSummer != null) {
      info.setSize(_cstream.getNBytes());
      info.setChecksum(new Checksum(ckSummer.getCRC32(), ChecksumType.CRC32));
      String value = ckSummer.getMD5();
      if (value != null) {
        info.setChecksum(new Checksum(value, ChecksumType.MD5));
      }
      if ((value = ckSummer.getSHA1()) != null) {
        info.setChecksum(new Checksum(value, ChecksumType.SHA1));
      }
    }

    return 0;
  }
Example #2
0
  /**
   * Parses the content of a purported WAVE digital object and stores the results in RepInfo.
   *
   * @param stream An InputStream, positioned at its beginning, which is generated from the object
   *     to be parsed
   * @param info A fresh RepInfo object which will be modified to reflect the results of the parsing
   * @param parseIndex Must be 0 in first call to <code>parse</code>. If <code>parse</code> returns
   *     a nonzero value, it must be called again with <code>parseIndex</code> equal to that return
   *     value.
   */
  public int parse(InputStream stream, RepInfo info, int parseIndex) throws IOException {
    initParse();
    info.setFormat(_format[0]);
    info.setMimeType(_mimeType[0]);
    info.setModule(this);

    _aesMetadata.setPrimaryIdentifier(info.getUri());
    if (info.getURLFlag()) {
      _aesMetadata.setOtherPrimaryIdentifierType("URI");
    } else {
      _aesMetadata.setPrimaryIdentifierType(AESAudioMetadata.FILE_NAME);
    }

    /* We may have already done the checksums while converting a
    temporary file. */
    _ckSummer = null;
    if (_je != null && _je.getChecksumFlag() && info.getChecksum().size() == 0) {
      _ckSummer = new Checksummer();
      _cstream = new ChecksumInputStream(stream, _ckSummer);
      _dstream = getBufferedDataStream(_cstream, _je != null ? _je.getBufferSize() : 0);
    } else {
      _dstream = getBufferedDataStream(stream, _je != null ? _je.getBufferSize() : 0);
    }

    try {
      // Check the start of the file for the right opening bytes
      for (int i = 0; i < 4; i++) {
        int ch = readUnsignedByte(_dstream, this);
        if (ch != sigByte[i]) {
          info.setMessage(new ErrorMessage("Document does not start with RIFF chunk", 0));
          info.setWellFormed(false);
          return 0;
        }
      }
      /* If we got this far, take note that the signature is OK. */
      info.setSigMatch(_name);

      // Get the length of the Form chunk.  This includes all
      // the subsequent chunks in the file, but excludes the
      // header ("FORM" and the length itself).
      bytesRemaining = readUnsignedInt(_dstream);

      // Read the file type.
      String typ = read4Chars(_dstream);
      bytesRemaining -= 4;
      if (!"WAVE".equals(typ)) {
        info.setMessage(new ErrorMessage("File type in RIFF header is not WAVE", _nByte));
        info.setWellFormed(false);
        return 0;
      }

      while (bytesRemaining > 0) {
        if (!readChunk(info)) {
          break;
        }
      }
    } catch (EOFException e) {
      info.setWellFormed(false);
      info.setMessage(new ErrorMessage("Unexpected end of file", _nByte));
      return 0;
    }

    // Set duration from number of samples and rate.
    if (numSamples > 0) {
      // _aesMetadata.setDuration((double) numSamples / sampleRate);
      _aesMetadata.setDuration(numSamples);
    }

    // Add note and label properties, if there's anything
    // to report.
    if (!_labels.isEmpty()) {
      _propList.add(new Property("Labels", PropertyType.PROPERTY, PropertyArity.LIST, _labels));
    }
    if (!_labeledText.isEmpty()) {
      _propList.add(
          new Property("LabeledText", PropertyType.PROPERTY, PropertyArity.LIST, _labeledText));
    }
    if (!_notes.isEmpty()) {
      _propList.add(new Property("Notes", PropertyType.PROPERTY, PropertyArity.LIST, _notes));
    }
    if (!_samples.isEmpty()) {
      _propList.add(new Property("Samples", PropertyType.PROPERTY, PropertyArity.LIST, _samples));
    }
    if (_exifInfo != null) {
      _propList.add(_exifInfo.buildProperty());
    }
    if (!formatChunkSeen) {
      info.setMessage(new ErrorMessage("No Format Chunk"));
      info.setWellFormed(false);
      return 0;
    }

    /* This file looks OK. */
    if (_ckSummer != null) {
      /* We may not have actually hit the end of file. If we're calculating
       * checksums on the fly, we have to read and discard whatever is
       * left, so it will get checksummed. */
      for (; ; ) {
        try {
          int n = skipBytes(_dstream, 2048, this);
          if (n == 0) {
            break;
          }
        } catch (Exception e) {
          break;
        }
      }
      info.setSize(_cstream.getNBytes());
      info.setChecksum(new Checksum(_ckSummer.getCRC32(), ChecksumType.CRC32));
      String value = _ckSummer.getMD5();
      if (value != null) {
        info.setChecksum(new Checksum(value, ChecksumType.MD5));
      }
      if ((value = _ckSummer.getSHA1()) != null) {
        info.setChecksum(new Checksum(value, ChecksumType.SHA1));
      }
    }

    info.setProperty(_metadata);

    // Indicate satisfied profiles.
    if (flagPCMWaveFormat) {
      info.setProfile("PCMWAVEFORMAT");
    }
    if (flagWaveFormatEx) {
      info.setProfile("WAVEFORMATEX");
    }
    if (flagWaveFormatExtensible) {
      info.setProfile("WAVEFORMATEXTENSIBLE");
    }
    if (flagBroadcastWave) {
      // Need to do some additional checks.
      if (!broadcastExtChunkSeen) {
        flagBroadcastWave = false;
      }
      if (compressionCode == FormatChunk.WAVE_FORMAT_MPEG) {
        if (!broadcastExtChunkSeen || !factChunkSeen) {
          flagBroadcastWave = false;
        }
      }
      if (flagBroadcastWave) {
        String prof = null;
        switch (broadcastVersion) {
          case 0:
            prof = "Broadcast Wave Version 0";
            break;

          case 1:
            prof = "Broadcast Wave Version 1";
            break;

            // Other versions are unknown at this time
        }
        if (prof != null) {
          info.setProfile(prof);
        }
      }
    }
    return 0;
  }