示例#1
0
 /**
  * Parse the IFD.
  *
  * @param byteOffsetIsValid If true, allow offsets on odd byte boundaries
  * @param suppressErrors If true, return IFD even with errors
  * @return The offset of the next IFD
  */
 public long parse(boolean byteOffsetIsValid, boolean suppressErrors) throws TiffException {
   try {
     return parse(byteOffsetIsValid);
   } catch (TiffException e) {
     // If we got a TiffException and we're suppressing errors,
     // cover over the exception and issue an info message;
     // but we can't follow the IFD chain further.
     if (suppressErrors) {
       _info.setMessage(new InfoMessage(e.getMessage(), e.getOffset()));
       return 0;
     } else throw e;
   }
 }
  /**
   * Parse the content of a purported HTML stream digital object and store the results in RepInfo.
   *
   * @param stream An InputStream, positioned at its beginning, which is generated from the object
   *     to be parsed. If multiple calls to <code>parse</code> are made on the basis of a nonzero
   *     value being returned, a new InputStream must be provided each time.
   * @param info A fresh (on the first call) RepInfo object which will be modified to reflect the
   *     results of the parsing If multiple calls to <code>parse</code> are made on the basis of a
   *     nonzero value being returned, the same RepInfo object should be passed with each call.
   * @param parseIndex Must be 0 in first call to <code>parse</code>. If <code>parse</code> returns
   *     a nonzero value, it must be called again with <code>parseIndex</code> equal to that return
   *     value.
   */
  public int parse(InputStream stream, RepInfo info, int parseIndex) throws IOException {
    if (parseIndex != 0) {
      // Coming in with parseIndex = 1 indicates that we've determined
      // this is XHTML; so we invoke the XML module to parse it.
      // If parseIndex is 100, this is the first invocation of the
      // XML module, so we call it with 0; otherwise we call it with
      // the value of parseIndex.
      if (isXmlAvailable()) {
        edu.harvard.hul.ois.jhove.module.XmlModule xmlMod =
            new edu.harvard.hul.ois.jhove.module.XmlModule();
        if (parseIndex == 100) {
          parseIndex = 0;
        }
        xmlMod.setApp(_app);
        xmlMod.setBase(_je);
        xmlMod.setDefaultParams(_defaultParams);
        try {
          xmlMod.applyDefaultParams();
        } catch (Exception e) {
          // really shouldn't happen
        }
        xmlMod.setXhtmlDoctype(_doctype);
        return xmlMod.parse(stream, info, parseIndex);
      } else {
        // The XML module shouldn't be missing from any installation,
        // but someone who really wanted to could remove it.  In
        // that case, you deserve what you get.
        info.setMessage(new ErrorMessage("XML-HUL module required to validate XHTML documents"));
        info.setWellFormed(false); // Treat it as completely wrong
        return 0;
      }
    } else {
      /* parseIndex = 0, first call only */
      _doctype = null;
    }
    // Test if textMD is to be generated
    if (_defaultParams != null) {
      Iterator iter = _defaultParams.iterator();
      while (iter.hasNext()) {
        String param = (String) iter.next();
        if (param.toLowerCase().equals("withtextmd=true")) {
          _withTextMD = true;
        }
      }
    }

    initParse();
    info.setFormat(_format[0]);
    info.setMimeType(_mimeType[0]);
    info.setModule(this);

    if (_textMD == null || parseIndex == 0) {
      _textMD = new TextMDMetadata();
    }
    /* We may have already done the checksums while converting a
    temporary file. */
    Checksummer ckSummer = null;
    if (_je != null && _je.getChecksumFlag() && info.getChecksum().size() == 0) {
      ckSummer = new Checksummer();
      _cstream = new ChecksumInputStream(stream, ckSummer);
      _dstream = getBufferedDataStream(_cstream, _je != null ? _je.getBufferSize() : 0);
    } else {
      _dstream = getBufferedDataStream(stream, _je != null ? _je.getBufferSize() : 0);
    }

    ParseHtml parser = null;
    HtmlMetadata metadata = null;
    HtmlCharStream cstream = null;
    try {
      cstream = new HtmlCharStream(_dstream, "ISO-8859-1");
      parser = new ParseHtml(cstream);
    } catch (UnsupportedEncodingException e) {
      info.setMessage(new ErrorMessage("Internal error: " + e.getMessage()));
      info.setWellFormed(false);
      return 0; // shouldn't happen!
    }
    int type = 0;
    try {
      List elements = parser.HtmlDoc();
      if (elements.isEmpty()) {
        // Consider an empty document bad
        info.setWellFormed(false);
        info.setMessage(new ErrorMessage("Document is empty"));
        return 0;
      }
      type = checkDoctype(elements);
      if (type < 0) {
        info.setWellFormed(false);
        info.setMessage(new ErrorMessage("DOCTYPE is not HTML"));
        return 0;
      }
      /* Check if there is at least one html, head, body or title tag.
       * A plain text document
       * might be interpreted as a single PCDATA, which is in some
       * ethereal sense well-formed HTML, but it's pointless to consider
       * it such.  It might also use angle brackets as a text delimiter,
       * and that shouldn't count as HTML either. */
      boolean hasElements = false;
      Iterator iter = elements.iterator();
      while (iter.hasNext()) {
        Object o = iter.next();
        if (o instanceof JHOpenTag) {
          String name = ((JHOpenTag) o).getName();
          if ("html".equals(name)
              || "head".equals(name)
              || "body".equals(name)
              || "title".equals(name)) {
            hasElements = true;
          }
          break;
        }
      }
      if (!hasElements) {
        info.setMessage(new ErrorMessage("Document contains no html, head, body or title tags"));
        info.setWellFormed(false);
        return 0;
      }

      // CRLF from HtmlCharStream ...
      String lineEnd = cstream.getKindOfLineEnd();
      if (lineEnd == null) {
        info.setMessage(new InfoMessage("Not able to determine type of end of line"));
        _textMD.setLinebreak(TextMDMetadata.NILL);
      } else if (lineEnd.equalsIgnoreCase("CR")) {
        _textMD.setLinebreak(TextMDMetadata.LINEBREAK_CR);
      } else if (lineEnd.equalsIgnoreCase("LF")) {
        _textMD.setLinebreak(TextMDMetadata.LINEBREAK_LF);
      } else if (lineEnd.equalsIgnoreCase("CRLF")) {
        _textMD.setLinebreak(TextMDMetadata.LINEBREAK_CRLF);
      }

      if (type == 0) {
        /* If we can't find a doctype, it still might be XHTML
         * if the elements start with an XML declaration and
         * the root element is "html" */
        switch (seemsToBeXHTML(elements)) {
          case 0: // Not XML
            break; // fall through
          case 1: // XML but not HTML
            info.setMessage(
                new ErrorMessage(
                    "Document has XML declaration but no DOCTYPE; "
                        + "probably XML rather than HTML"));
            info.setWellFormed(false);
            return 0;
          case 2: // probably XHTML
            return 100;
        }
        info.setMessage(
            new ErrorMessage(
                "Unrecognized or missing DOCTYPE declaration; "
                    + "validation continuing as HTML 3.2"));
        info.setValid(false);
        // But keep going
      }

      HtmlDocDesc docDesc = null;
      switch (type) {
        case HTML_3_2:
        default:
          docDesc = new Html3_2DocDesc();
          _textMD.setMarkup_basis("HTML");
          _textMD.setMarkup_basis_version("3.2");
          break;

        case HTML_4_0_FRAMESET:
          docDesc = new Html4_0FrameDocDesc();
          _textMD.setMarkup_basis("HTML");
          _textMD.setMarkup_basis_version("4.0");
          break;
        case HTML_4_0_TRANSITIONAL:
          docDesc = new Html4_0TransDocDesc();
          _textMD.setMarkup_basis("HTML");
          _textMD.setMarkup_basis_version("4.0");
          break;
        case HTML_4_0_STRICT:
          docDesc = new Html4_0StrictDocDesc();
          _textMD.setMarkup_basis("HTML");
          _textMD.setMarkup_basis_version("4.0");
          break;
        case HTML_4_01_FRAMESET:
          docDesc = new Html4_01FrameDocDesc();
          _textMD.setMarkup_basis("HTML");
          _textMD.setMarkup_basis_version("4.01");
          break;
        case HTML_4_01_TRANSITIONAL:
          docDesc = new Html4_01TransDocDesc();
          _textMD.setMarkup_basis("HTML");
          _textMD.setMarkup_basis_version("4.01");
          break;
        case HTML_4_01_STRICT:
          docDesc = new Html4_01StrictDocDesc();
          _textMD.setMarkup_basis("HTML");
          _textMD.setMarkup_basis_version("4.01");
          break;
        case XHTML_1_0_STRICT:
        case XHTML_1_0_TRANSITIONAL:
        case XHTML_1_0_FRAMESET:
        case XHTML_1_1:
          // Force a second call to parse as XML. 100 is a
          // magic code for the first XML call.
          return 100;
      }
      _textMD.setMarkup_language(_doctype);
      if (docDesc == null) {
        info.setMessage(
            new InfoMessage(
                "Code for appropriate HTML version not available yet:" + "substituting HTML 3.2"));
        docDesc = new Html3_2DocDesc();
      }
      docDesc.validate(elements, info);
      metadata = docDesc.getMetadata();

      // Try to get the charset from the meta Content
      if (metadata.getCharset() != null) {
        _textMD.setCharset(metadata.getCharset());
      } else {
        _textMD.setCharset(TextMDMetadata.CHARSET_ISO8859_1);
      }
      String textMDEncoding = _textMD.getCharset();
      if (textMDEncoding.indexOf("UTF") != -1) {
        _textMD.setByte_order(
            _bigEndian ? TextMDMetadata.BYTE_ORDER_BIG : TextMDMetadata.BYTE_ORDER_LITTLE);
        _textMD.setByte_size("8");
        _textMD.setCharacter_size("variable");
      } else {
        _textMD.setByte_order(
            _bigEndian ? TextMDMetadata.BYTE_ORDER_BIG : TextMDMetadata.BYTE_ORDER_LITTLE);
        _textMD.setByte_size("8");
        _textMD.setCharacter_size("1");
      }
    } catch (ParseException e) {
      Token t = e.currentToken;
      info.setMessage(
          new ErrorMessage("Parse error", "Line = " + t.beginLine + ", column = " + t.beginColumn));
      info.setWellFormed(false);
    } catch (TokenMgrError f) {
      info.setMessage(new ErrorMessage("TokenMgrError: " + f.getLocalizedMessage()));
      info.setWellFormed(false);
    }

    if (info.getWellFormed() == RepInfo.FALSE) {
      return 0;
    }

    if (type != 0) {
      if (profileNames[type] != null) {
        info.setProfile(profileNames[type]);
      }
      info.setVersion(versionNames[type]);
    }

    if (metadata != null) {
      Property property = metadata.toProperty(_withTextMD ? _textMD : null);
      if (property != null) {
        info.setProperty(property);
      }
    }

    if (ckSummer != null) {
      info.setSize(_cstream.getNBytes());
      info.setChecksum(new Checksum(ckSummer.getCRC32(), ChecksumType.CRC32));
      String value = ckSummer.getMD5();
      if (value != null) {
        info.setChecksum(new Checksum(value, ChecksumType.MD5));
      }
      if ((value = ckSummer.getSHA1()) != null) {
        info.setChecksum(new Checksum(value, ChecksumType.SHA1));
      }
    }

    return 0;
  }
示例#3
0
  /**
   * Parse the IFD. Errors are not suppressed.
   *
   * @param byteOffsetIsValid If true, allow offsets on odd byte boundaries
   * @return The offset of the next IFD
   */
  public long parse(boolean byteOffsetIsValid) throws TiffException {
    /* Start at the IFD offset, read the number of entries, then
     * read the entire IFD.
     */
    long offset = _offset;
    _next = 0L;
    byte[] buffer;
    int nFields = 0;
    try {
      _raf.seek(offset);
      nFields = ModuleBase.readUnsignedShort(_raf, _bigEndian);
      offset += 2;

      int len = 12 * nFields;
      buffer = new byte[len];
      _raf.read(buffer, 0, len);

      /* Read the offset of the next IFD (or 0 if none). */
      offset += len;
      _next = ModuleBase.readUnsignedInt(_raf, _bigEndian);
    } catch (Exception e) {
      throw new TiffException("Premature EOF", offset);
    }

    DataInputStream ifdStream = new DataInputStream(new ByteArrayInputStream(buffer));

    try {
      int prevTag = 0;
      for (int i = 0; i < nFields; i++) {
        int tag = ModuleBase.readUnsignedShort(ifdStream, _bigEndian, null);
        /* Tags must be in ascending numerical order. */
        if (!debug_allowoutofsequence && tag < prevTag) {
          _info.setMessage(
              new ErrorMessage("Tag " + tag + " out of sequence", _offset + 2 + 12 * i));
          _info.setWellFormed(false);
        }
        prevTag = tag;

        int type = ModuleBase.readUnsignedShort(ifdStream, _bigEndian, null);
        /* Skip over tags with unknown type. */
        if (type < BYTE || type > IFD) {
          _info.setMessage(
              new ErrorMessage(
                  "Unknown data type", "Type = " + type + ", Tag = " + tag, _offset + 4 + 12 * i));
        } else {
          /* Type gives indication of the TIFF version. */
          if (SBYTE <= type && type <= IFD) {
            _version = 6;
          }

          long count = ModuleBase.readUnsignedInt(ifdStream, _bigEndian, null);
          long value = ModuleBase.readUnsignedInt(ifdStream, _bigEndian, null);
          if (calcValueSize(type, count) > 4) {
            /* Value is the word-aligned offset of the actual
             * value. */
            if ((value & 1) != 0) {
              if (byteOffsetIsValid) {
                _info.setMessage(
                    new InfoMessage(
                        "Value offset not word-aligned: " + value, _offset + 10 + 12 * i));
              } else {
                throw new TiffException(
                    "Value offset not " + "word-aligned: " + value, _offset + 10 + 12 * i);
              }
            }
          } else {
            /* Value is the actual value; pass the offset of
             * the value. */
            value = _offset + 10 + 12 * i;
          }
          lookupTag(tag, type, count, value);
        }
      }
    } catch (IOException e) {
      throw new TiffException("Read error", _offset + 2);
    }
    postParseInitialization();

    return _next;
  }
示例#4
0
 /* Factor out the reporting of duplicate chunks. */
 protected void dupChunkError(RepInfo info, String chunkName) {
   info.setMessage(new ErrorMessage("Multiple " + chunkName + " Chunks not permitted", _nByte));
   info.setValid(false);
 }
示例#5
0
  /** Reads a WAVE Chunk. */
  protected boolean readChunk(RepInfo info) throws IOException {
    Chunk chunk = null;
    ChunkHeader chunkh = new ChunkHeader(this, info);
    if (!chunkh.readHeader(_dstream)) {
      return false;
    }
    int chunkSize = (int) chunkh.getSize();
    bytesRemaining -= chunkSize + 8;

    if (bytesRemaining < 0) {
      info.setMessage(new ErrorMessage("Invalid chunk size", _nByte));
      return false;
    }

    String id = chunkh.getID();
    if ("fmt ".equals(id)) {
      if (formatChunkSeen) {
        dupChunkError(info, "Format");
      }
      chunk = new FormatChunk(this, chunkh, _dstream);
      formatChunkSeen = true;
    } else if ("data".equals(id)) {
      if (dataChunkSeen) {
        dupChunkError(info, "Data");
      }
      chunk = new DataChunk(this, chunkh, _dstream);
      dataChunkSeen = true;
    } else if ("fact".equals(id)) {
      chunk = new FactChunk(this, chunkh, _dstream);
      factChunkSeen = true;
      // Are multiple 'fact' chunks allowed?
    } else if ("note".equals(id)) {
      chunk = new NoteChunk(this, chunkh, _dstream);
      // Multiple note chunks are allowed
    } else if ("labl".equals(id)) {
      chunk = new LabelChunk(this, chunkh, _dstream);
      // Multiple label chunks are allowed
    } else if ("list".equals(id)) {
      chunk = new AssocDataListChunk(this, chunkh, _dstream, info);
      // Are multiple chunks allowed?  Who knows?
    } else if ("LIST".equals(id)) {
      chunk = new ListInfoChunk(this, chunkh, _dstream, info);
      // Multiple list chunks must be OK, since there can
      // be different types, e.g., an INFO list and an exif list.
    } else if ("smpl".equals(id)) {
      chunk = new SampleChunk(this, chunkh, _dstream);
      // Multiple sample chunks are allowed -- I think
    } else if ("inst".equals(id)) {
      if (instrumentChunkSeen) {
        dupChunkError(info, "Instrument");
      }
      chunk = new InstrumentChunk(this, chunkh, _dstream);
      // Only one instrument chunk is allowed
      instrumentChunkSeen = true;
    } else if ("mext".equals(id)) {
      if (mpegChunkSeen) {
        dupChunkError(info, "MPEG");
      }
      chunk = new MpegChunk(this, chunkh, _dstream);
      // I think only one MPEG chunk is allowed
      mpegChunkSeen = true;
    } else if ("cart".equals(id)) {
      if (cartChunkSeen) {
        dupChunkError(info, "Cart");
      }
      chunk = new CartChunk(this, chunkh, _dstream);
      cartChunkSeen = true;
    } else if ("bext".equals(id)) {
      if (broadcastExtChunkSeen) {
        dupChunkError(info, "Broadcast Audio Extension");
      }
      chunk = new BroadcastExtChunk(this, chunkh, _dstream);
      broadcastExtChunkSeen = true;
    } else if ("levl".equals(id)) {
      if (peakChunkSeen) {
        dupChunkError(info, "Peak Envelope");
      }
      chunk = new PeakEnvelopeChunk(this, chunkh, _dstream);
      peakChunkSeen = true;
    } else if ("link".equals(id)) {
      if (linkChunkSeen) {
        dupChunkError(info, "Link");
      }
      chunk = new LinkChunk(this, chunkh, _dstream);
      linkChunkSeen = true;
    } else if ("cue ".equals(id)) {
      if (cueChunkSeen) {
        dupChunkError(info, "Cue");
      }
      chunk = new CueChunk(this, chunkh, _dstream);
      cueChunkSeen = true;
    } else {
      info.setMessage(new InfoMessage("Chunk type '" + id + "' ignored", _nByte));
    }

    if (chunk != null) {
      if (!chunk.readChunk(info)) {
        return false;
      }
    } else {
      // Other chunk types are legal, just skip over them
      skipBytes(_dstream, chunkSize, this);
    }

    if ((chunkSize & 1) != 0) {
      // Must come out to an even byte boundary
      skipBytes(_dstream, 1, this);
      --bytesRemaining;
    }
    return true;
  }
示例#6
0
  /**
   * Parses the content of a purported WAVE digital object and stores the results in RepInfo.
   *
   * @param stream An InputStream, positioned at its beginning, which is generated from the object
   *     to be parsed
   * @param info A fresh RepInfo object which will be modified to reflect the results of the parsing
   * @param parseIndex Must be 0 in first call to <code>parse</code>. If <code>parse</code> returns
   *     a nonzero value, it must be called again with <code>parseIndex</code> equal to that return
   *     value.
   */
  public int parse(InputStream stream, RepInfo info, int parseIndex) throws IOException {
    initParse();
    info.setFormat(_format[0]);
    info.setMimeType(_mimeType[0]);
    info.setModule(this);

    _aesMetadata.setPrimaryIdentifier(info.getUri());
    if (info.getURLFlag()) {
      _aesMetadata.setOtherPrimaryIdentifierType("URI");
    } else {
      _aesMetadata.setPrimaryIdentifierType(AESAudioMetadata.FILE_NAME);
    }

    /* We may have already done the checksums while converting a
    temporary file. */
    _ckSummer = null;
    if (_je != null && _je.getChecksumFlag() && info.getChecksum().size() == 0) {
      _ckSummer = new Checksummer();
      _cstream = new ChecksumInputStream(stream, _ckSummer);
      _dstream = getBufferedDataStream(_cstream, _je != null ? _je.getBufferSize() : 0);
    } else {
      _dstream = getBufferedDataStream(stream, _je != null ? _je.getBufferSize() : 0);
    }

    try {
      // Check the start of the file for the right opening bytes
      for (int i = 0; i < 4; i++) {
        int ch = readUnsignedByte(_dstream, this);
        if (ch != sigByte[i]) {
          info.setMessage(new ErrorMessage("Document does not start with RIFF chunk", 0));
          info.setWellFormed(false);
          return 0;
        }
      }
      /* If we got this far, take note that the signature is OK. */
      info.setSigMatch(_name);

      // Get the length of the Form chunk.  This includes all
      // the subsequent chunks in the file, but excludes the
      // header ("FORM" and the length itself).
      bytesRemaining = readUnsignedInt(_dstream);

      // Read the file type.
      String typ = read4Chars(_dstream);
      bytesRemaining -= 4;
      if (!"WAVE".equals(typ)) {
        info.setMessage(new ErrorMessage("File type in RIFF header is not WAVE", _nByte));
        info.setWellFormed(false);
        return 0;
      }

      while (bytesRemaining > 0) {
        if (!readChunk(info)) {
          break;
        }
      }
    } catch (EOFException e) {
      info.setWellFormed(false);
      info.setMessage(new ErrorMessage("Unexpected end of file", _nByte));
      return 0;
    }

    // Set duration from number of samples and rate.
    if (numSamples > 0) {
      // _aesMetadata.setDuration((double) numSamples / sampleRate);
      _aesMetadata.setDuration(numSamples);
    }

    // Add note and label properties, if there's anything
    // to report.
    if (!_labels.isEmpty()) {
      _propList.add(new Property("Labels", PropertyType.PROPERTY, PropertyArity.LIST, _labels));
    }
    if (!_labeledText.isEmpty()) {
      _propList.add(
          new Property("LabeledText", PropertyType.PROPERTY, PropertyArity.LIST, _labeledText));
    }
    if (!_notes.isEmpty()) {
      _propList.add(new Property("Notes", PropertyType.PROPERTY, PropertyArity.LIST, _notes));
    }
    if (!_samples.isEmpty()) {
      _propList.add(new Property("Samples", PropertyType.PROPERTY, PropertyArity.LIST, _samples));
    }
    if (_exifInfo != null) {
      _propList.add(_exifInfo.buildProperty());
    }
    if (!formatChunkSeen) {
      info.setMessage(new ErrorMessage("No Format Chunk"));
      info.setWellFormed(false);
      return 0;
    }

    /* This file looks OK. */
    if (_ckSummer != null) {
      /* We may not have actually hit the end of file. If we're calculating
       * checksums on the fly, we have to read and discard whatever is
       * left, so it will get checksummed. */
      for (; ; ) {
        try {
          int n = skipBytes(_dstream, 2048, this);
          if (n == 0) {
            break;
          }
        } catch (Exception e) {
          break;
        }
      }
      info.setSize(_cstream.getNBytes());
      info.setChecksum(new Checksum(_ckSummer.getCRC32(), ChecksumType.CRC32));
      String value = _ckSummer.getMD5();
      if (value != null) {
        info.setChecksum(new Checksum(value, ChecksumType.MD5));
      }
      if ((value = _ckSummer.getSHA1()) != null) {
        info.setChecksum(new Checksum(value, ChecksumType.SHA1));
      }
    }

    info.setProperty(_metadata);

    // Indicate satisfied profiles.
    if (flagPCMWaveFormat) {
      info.setProfile("PCMWAVEFORMAT");
    }
    if (flagWaveFormatEx) {
      info.setProfile("WAVEFORMATEX");
    }
    if (flagWaveFormatExtensible) {
      info.setProfile("WAVEFORMATEXTENSIBLE");
    }
    if (flagBroadcastWave) {
      // Need to do some additional checks.
      if (!broadcastExtChunkSeen) {
        flagBroadcastWave = false;
      }
      if (compressionCode == FormatChunk.WAVE_FORMAT_MPEG) {
        if (!broadcastExtChunkSeen || !factChunkSeen) {
          flagBroadcastWave = false;
        }
      }
      if (flagBroadcastWave) {
        String prof = null;
        switch (broadcastVersion) {
          case 0:
            prof = "Broadcast Wave Version 0";
            break;

          case 1:
            prof = "Broadcast Wave Version 1";
            break;

            // Other versions are unknown at this time
        }
        if (prof != null) {
          info.setProfile(prof);
        }
      }
    }
    return 0;
  }