Beispiel #1
0
  /**
   * Reads a run of NUL-terminated ASCII strings from the TIFF file.
   *
   * <p>Every zero byte ends one string, so consecutive zero bytes produce empty strings. Bytes
   * after the last zero byte (an unterminated trailing string) are discarded.
   *
   * @param count Number of bytes to read; this is the combined length of all the strings,
   *     including their terminating NULs
   * @param value Offset from which to read
   * @return the strings found, in file order; an empty array if no terminator was read
   * @throws IOException if seeking to or reading from the offset fails
   */
  protected String[] readASCIIArray(long count, long value) throws IOException {
    _raf.seek(value);

    byte[] buf = new byte[(int) count];
    // read() reports how many bytes it actually delivered (-1 at end of file). Only that many
    // bytes are scanned, so the zero fill of an incompletely read buffer is never mistaken for
    // string terminators.
    int nread = _raf.read(buf);

    List list = new LinkedList();
    StringBuffer strbuf = new StringBuffer();
    for (int i = 0; i < nread; i++) {
      byte b = buf[i];
      if (b == 0) {
        list.add(strbuf.toString());
        strbuf.setLength(0);
      } else {
        // Mask to 0..255 so that bytes >= 0x80 are not sign-extended into U+FF80..U+FFFF.
        strbuf.append((char) (b & 0xFF));
      }
    }
    // The result used to be sized by a counter that was never incremented, which made this
    // method return an empty array for every input; size it from the collected list instead.
    return (String[]) list.toArray(new String[list.size()]);
  }
  /**
   * Reads the body of an Instrument chunk from the data stream and registers it with the AIFF
   * module as a single "Instrument" LIST property.
   *
   * <p>The fields are consumed in stream order: base note, detune, low/high note, low/high
   * velocity, gain, then the sustain and release loops.
   *
   * @param info not referenced by this implementation
   * @return <code>true</code>; this implementation has no path that returns <code>false</code>
   * @throws IOException if one of the underlying reads fails
   */
  public boolean readChunk(RepInfo info) throws IOException {
    final AiffModule aiff = (AiffModule) _module;

    // All reads happen first, in the order the fields are laid out in the stream.
    final int noteBase = ModuleBase.readUnsignedByte(_dstream, aiff);
    final int detuneCents = ModuleBase.readSignedByte(_dstream, aiff);
    final int noteLow = ModuleBase.readUnsignedByte(_dstream, aiff);
    final int noteHigh = ModuleBase.readUnsignedByte(_dstream, aiff);
    final int velocityLow = ModuleBase.readUnsignedByte(_dstream, aiff);
    final int velocityHigh = ModuleBase.readUnsignedByte(_dstream, aiff);
    final int gainValue = aiff.readSignedShort(_dstream);
    final Loop sustain = readLoop(aiff);
    final Loop release = readLoop(aiff);

    // Nine sub-properties: seven integers plus the two loops.
    final List fields = new ArrayList(9);
    fields.add(new Property("BaseNote", PropertyType.INTEGER, Integer.valueOf(noteBase)));
    fields.add(new Property("Detune", PropertyType.INTEGER, Integer.valueOf(detuneCents)));
    fields.add(new Property("LowNote", PropertyType.INTEGER, Integer.valueOf(noteLow)));
    fields.add(new Property("HighNote", PropertyType.INTEGER, Integer.valueOf(noteHigh)));
    fields.add(new Property("LowVelocity", PropertyType.INTEGER, Integer.valueOf(velocityLow)));
    fields.add(new Property("HighVelocity", PropertyType.INTEGER, Integer.valueOf(velocityHigh)));
    fields.add(new Property("Gain", PropertyType.INTEGER, Integer.valueOf(gainValue)));
    fields.add(sustain.loopProp("SustainLoop"));
    fields.add(release.loopProp("ReleaseLoop"));

    final Property instrument =
        new Property("Instrument", PropertyType.PROPERTY, PropertyArity.LIST, fields);
    aiff.addAiffProperty(instrument);
    return true;
  }
 /**
  * Judges whether a document, even one without a doctype, is most likely XHTML. The document
  * qualifies when it begins with an XML declaration and its first open tag is "html".
  *
  * @param elements the parsed document elements
  * @return 0 if the first element is not an XML declaration (or the list cannot be examined at
  *     all, e.g. because it is empty); 1 if there is an XML declaration but the first open tag is
  *     not "html", or there is no open tag, in which case it is probably some other kind of XML;
  *     2 if there is an XML declaration and the first open tag is "html"
  */
 protected int seemsToBeXHTML(List elements) {
   try {
     JHElement first = (JHElement) elements.get(0);
     if (!(first instanceof JHXmlDecl)) {
       return 0;
     }
     // Only the first open tag decides the outcome.
     for (Iterator it = elements.iterator(); it.hasNext(); ) {
       JHElement current = (JHElement) it.next();
       if (!(current instanceof JHOpenTag)) {
         continue;
       }
       String tagName = ((JHOpenTag) current).getName();
       if ("html".equals(tagName)) {
         return 2;
       }
       return 1;
     }
     return 1;
   } catch (Exception e) {
     return 0; // document must be really empty
   }
 }
Beispiel #4
0
  /**
   * Returns a Property representing a bitmask. If <code>rawOutput</code> is true, returns a LONG
   * property whose numeric value is <code>value</code>. If <code>rawOutput</code> is false,
   * returns a LIST property of strings: the elements of <code>labels</code> whose indices
   * correspond to 1 bits in the bitmask, counting the low-order bit as bit 0.
   *
   * <p>Bits at positions beyond the end of <code>labels</code> are ignored. If <code>labels
   * </code> is null, an error is recorded in <code>_errors</code> and the returned list is empty.
   *
   * @param name the property name
   * @param value the bitmask
   * @param labels labels for the bit positions, indexed by bit number
   * @param rawOutput whether to report the numeric value instead of the labels
   * @return the LONG or STRING-list property described above; never null
   */
  protected Property addBitmaskProperty(
      String name, long value, String[] labels, boolean rawOutput) {
    if (rawOutput) {
      return new Property(name, PropertyType.LONG, Long.valueOf(value));
    }

    List list = new LinkedList();
    if (labels == null) {
      // Same outcome the former catch-all produced for a null label table.
      _errors.add(name + " value out of range: " + value);
    } else {
      // A long has 64 bits; never shift further than that.
      int nbits = Math.min(labels.length, Long.SIZE);
      for (int i = 0; i < nbits; i++) {
        // The shift must be done on a long: an int shift sign-extends bit 31 over the whole
        // upper half of the mask and wraps around from bit 32 onwards.
        if ((value & (1L << i)) != 0) {
          list.add(labels[i]);
        }
      }
    }
    return new Property(name, PropertyType.STRING, PropertyArity.LIST, list);
  }
 /**
  * Checks for a DOCTYPE at the start of the elements list (optionally preceded by an XML
  * declaration) and maps its public identifier to one of the HTML version constants.
  *
  * <p>As a side effect, <code>_doctype</code> is set to the public identifier (after upper-casing
  * and <code>stripQuotes</code>) whenever an HTML <code>PUBLIC</code> doctype is found, even if
  * the identifier is not one of the recognized ones.
  *
  * @param elements the parsed document elements
  * @return the matching <code>HTML_*</code> constant; -1 if the DOCTYPE says the document is not
  *     HTML (it is trusted, and the document is treated as ill-formed); 0 if there is no
  *     DOCTYPE, or an unrecognized one
  */
 protected int checkDoctype(List elements) {
   if (elements == null || elements.isEmpty()) {
     return 0; // nothing to inspect, hence no DOCTYPE
   }
   JHElement firstElem = (JHElement) elements.get(0);
   if (firstElem instanceof JHXmlDecl && elements.size() >= 2) {
     // Skip over a leading XML declaration.
     firstElem = (JHElement) elements.get(1);
   }
   if (!(firstElem instanceof JHDoctype)) {
     return 0; // no DOCTYPE found
   }
   List dt = ((JHDoctype) firstElem).getDoctypeElements();
   if (dt.size() < 3) {
     return 0;
   }
   try {
     // Is DOCTYPE case sensitive?  Assume not.
     // Upper-casing uses Locale.ROOT: with the default locale, a Turkish-style locale maps
     // 'i' to a dotted capital I and "public" / "transitional" would never match.
     String str = ((String) dt.get(0)).toUpperCase(java.util.Locale.ROOT);
     if (!"HTML".equals(str)) {
       // It's not HTML
       return -1;
     }
     str = ((String) dt.get(1)).toUpperCase(java.util.Locale.ROOT);
     if (!"PUBLIC".equals(str)) {
       return 0;
     }
     str = stripQuotes(((String) dt.get(2)).toUpperCase(java.util.Locale.ROOT));
     _doctype = str;
     if ("-//W3C//DTD HTML 3.2 FINAL//EN".equals(str) || "-//W3C//DTD HTML 3.2//EN".equals(str)) {
       return HTML_3_2;
     } else if ("-//W3C//DTD HTML 4.0//EN".equals(str)) {
       return HTML_4_0_STRICT;
     } else if ("-//W3C//DTD HTML 4.0 TRANSITIONAL//EN".equals(str)) {
       return HTML_4_0_TRANSITIONAL;
     } else if ("-//W3C//DTD HTML 4.0 FRAMESET//EN".equals(str)) {
       return HTML_4_0_FRAMESET;
     } else if ("-//W3C//DTD HTML 4.01//EN".equals(str)) {
       return HTML_4_01_STRICT;
     } else if ("-//W3C//DTD HTML 4.01 TRANSITIONAL//EN".equals(str)) {
       return HTML_4_01_TRANSITIONAL;
     } else if ("-//W3C//DTD HTML 4.01 FRAMESET//EN".equals(str)) {
       return HTML_4_01_FRAMESET;
     }
   } catch (Exception e) {
     // Really shouldn't happen, but if it does we've got
     // a bad doctype
     return 0;
   }
   return 0;
 }
Beispiel #6
0
 /**
  * Builds a property for a 32-bit bitmask, using two parallel label tables to describe each bit
  * position: one for bits that are set and one for bits that are clear.
  *
  * <p>When <code>_je</code> is present and its raw flag is on, the mask is reported as a plain
  * INTEGER property. Otherwise the result is a LIST of the non-empty labels selected for bit
  * positions 0 through <code>oneValueNames.length - 1</code>.
  *
  * @param val The bitmask
  * @param name The name for the Property
  * @param oneValueNames Array of names to use for '1' values
  * @param zeroValueNames Array of names to use for '0' values
  * @return the property, or <code>null</code> if an exception occurs while selecting labels
  *     (for instance when <code>zeroValueNames</code> is shorter than <code>oneValueNames</code>)
  */
 public Property buildBitmaskProperty(
     int val, String name, String[] oneValueNames, String[] zeroValueNames) {
   final boolean raw = _je != null && _je.getShowRawFlag();
   if (raw) {
     return new Property(name, PropertyType.INTEGER, Integer.valueOf(val));
   }

   List chosen = new LinkedList();
   try {
     int bit = 0;
     while (bit < oneValueNames.length) {
       boolean isSet = (val & (1 << bit)) != 0;
       String label = isSet ? oneValueNames[bit] : zeroValueNames[bit];
       // Null or empty labels mean "nothing to report for this bit state".
       if (label != null && label.length() > 0) {
         chosen.add(label);
       }
       bit++;
     }
   } catch (Exception e) {
     return null;
   }
   return new Property(name, PropertyType.STRING, PropertyArity.LIST, chosen);
 }
Beispiel #7
0
  /**
   * Returns a Property representing an integer value. If <code>rawOutput</code> is true, returns
   * an INTEGER property and <code>labels</code> is unused. Otherwise returns a STRING property
   * whose value is the element of <code>labels</code> at index <code>value</code>.
   *
   * <p>If the label cannot be obtained (for example because <code>value</code> is not a valid
   * index), an error is added to <code>_errors</code> and an INTEGER property is returned
   * instead.
   */
  protected Property addIntegerProperty(
      String name, int value, String[] labels, boolean rawOutput) {
    if (!rawOutput) {
      try {
        return new Property(name, PropertyType.STRING, labels[value]);
      } catch (Exception e) {
        _errors.add(name + " value out of range: " + value);
        // fall through to the numeric form
      }
    }
    return new Property(name, PropertyType.INTEGER, Integer.valueOf(value));
  }
Beispiel #8
0
  /**
   * Returns an ARRAY Property representing an integer array. If <code>rawOutput</code> is true, the
   * property is an INTEGER array holding <code>value</code> itself, and <code>labels</code> is
   * unused. Otherwise it is a STRING array in which each element of <code>value</code> has been
   * used as an index into <code>labels</code>.
   *
   * <p>An element that cannot be looked up adds an error to <code>_errors</code> and leaves the
   * corresponding slot of the string array null.
   */
  protected Property addIntegerArrayProperty(
      String name, int[] value, String[] labels, boolean rawOutput) {
    if (rawOutput) {
      return new Property(name, PropertyType.INTEGER, PropertyArity.ARRAY, value);
    }

    final int total = value.length;
    String[] names = new String[total];
    for (int pos = 0; pos < total; pos++) {
      int code = value[pos];
      try {
        names[pos] = labels[code];
      } catch (Exception e) {
        _errors.add(name + " value out of range: " + code);
      }
    }
    return new Property(name, PropertyType.STRING, PropertyArity.ARRAY, names);
  }
Beispiel #9
0
  /**
   * Resets the module's per-file state before a parse: chunk-seen flags, profile flags, sample
   * bookkeeping, the property collections, and the WAVE and AES metadata objects.
   */
  protected void initParse() {
    super.initParse();

    // Most chunk types are allowed to occur only once, and a few must occur
    // exactly once, so the "seen" flags start out cleared.
    // NOTE(review): no factChunkSeen reset appears in this method -- confirm whether one is needed.
    formatChunkSeen = false;
    dataChunkSeen = false;
    instrumentChunkSeen = false;
    cartChunkSeen = false;
    mpegChunkSeen = false;
    broadcastExtChunkSeen = false;
    peakChunkSeen = false;
    linkChunkSeen = false;
    cueChunkSeen = false;

    // Profile flags start out cleared as well.
    flagPCMWaveFormat = false;
    flagWaveFormatEx = false;
    flagWaveFormatExtensible = false;
    flagBroadcastWave = false;

    // Sample bookkeeping.
    firstSampleOffsetMarked = false;
    numSamples = 0;

    // Fresh collections for everything gathered while reading chunks.
    _notes = new LinkedList();
    _labels = new LinkedList();
    _labeledText = new LinkedList();
    _samples = new LinkedList();
    _propList = new LinkedList();

    // The top-level property wraps _propList; entries added to the list later
    // are added through the same list object.
    _metadata = new Property("WAVEMetadata", PropertyType.PROPERTY, PropertyArity.LIST, _propList);

    // AES audio metadata with the fixed values used for WAVE files.
    AESAudioMetadata aes = new AESAudioMetadata();
    aes.setByteOrder(AESAudioMetadata.LITTLE_ENDIAN);
    aes.setAnalogDigitalFlag("FILE_DIGITAL");
    aes.setFormat("WAVE");
    aes.setUse("OTHER", "JHOVE_validation");
    aes.setDirection("NONE");
    _aesMetadata = aes;

    _propList.add(new Property("AESAudioMetadata", PropertyType.AESAUDIOMETADATA, _aesMetadata));
  }
Beispiel #10
0
  /**
   * Returns a Property representing an integer value. If <code>rawOutput</code> is true, returns
   * an INTEGER property, and <code>labels</code> and <code>index</code> are unused. Otherwise,
   * returns a STRING property, with the string being the element of <code>labels</code> whose index
   * is the index of the first occurrence of <code>value</code> in <code>index</code>.
   *
   * <p>If <code>value</code> does not occur in <code>index</code>, or no label exists at the
   * matching position (including when either array is null), an error is added to <code>_errors
   * </code> and an INTEGER property is returned instead. This matches the fallback of the
   * overload that takes no <code>index</code> array.
   *
   * @param name the property name
   * @param value the value to represent
   * @param labels labels parallel to <code>index</code>
   * @param index the integer values that <code>labels</code> describe
   * @param rawOutput whether to report the numeric value instead of its label
   * @return the STRING or INTEGER property described above; never null
   */
  protected Property addIntegerProperty(
      String name, int value, String[] labels, int[] index, boolean rawOutput) {
    if (!rawOutput) {
      int n = -1;
      if (index != null) {
        for (int i = 0; i < index.length; i++) {
          if (value == index[i]) {
            n = i;
            break;
          }
        }
      }
      // Guard the label lookup: a label table shorter than the index table used to escape as
      // an ArrayIndexOutOfBoundsException instead of being reported like other bad values.
      if (n > -1 && labels != null && n < labels.length) {
        return new Property(name, PropertyType.STRING, labels[n]);
      }
      _errors.add(name + " value out of range: " + value);
    }
    return new Property(name, PropertyType.INTEGER, Integer.valueOf(value));
  }
Beispiel #11
0
  /**
   * Parses the content of a purported WAVE digital object and stores the results in RepInfo.
   *
   * <p>The method checks the four signature bytes, reads the RIFF length and the "WAVE" file
   * type, then reads chunks until the declared length is used up or a chunk reader reports
   * failure. Afterwards it attaches the collected properties, optional checksums and the
   * satisfied profiles to <code>info</code>.
   *
   * @param stream An InputStream, positioned at its beginning, which is generated from the object
   *     to be parsed
   * @param info A fresh RepInfo object which will be modified to reflect the results of the parsing
   * @param parseIndex Must be 0 in first call to <code>parse</code>. If <code>parse</code> returns
   *     a nonzero value, it must be called again with <code>parseIndex</code> equal to that return
   *     value. (This implementation never reads it.)
   * @return always 0, so a second call is never requested
   * @throws IOException if reading fails; an EOFException raised while reading the header or the
   *     chunks is caught here and reported on <code>info</code> as "Unexpected end of file"
   */
  public int parse(InputStream stream, RepInfo info, int parseIndex) throws IOException {
    initParse();
    info.setFormat(_format[0]);
    info.setMimeType(_mimeType[0]);
    info.setModule(this);

    // Identify the object in the AES metadata by its URI, typed as URI or file name.
    _aesMetadata.setPrimaryIdentifier(info.getUri());
    if (info.getURLFlag()) {
      _aesMetadata.setOtherPrimaryIdentifierType("URI");
    } else {
      _aesMetadata.setPrimaryIdentifierType(AESAudioMetadata.FILE_NAME);
    }

    /* We may have already done the checksums while converting a
    temporary file. */
    // Checksums are computed on the fly only when requested and not already present.
    _ckSummer = null;
    if (_je != null && _je.getChecksumFlag() && info.getChecksum().size() == 0) {
      _ckSummer = new Checksummer();
      _cstream = new ChecksumInputStream(stream, _ckSummer);
      _dstream = getBufferedDataStream(_cstream, _je != null ? _je.getBufferSize() : 0);
    } else {
      _dstream = getBufferedDataStream(stream, _je != null ? _je.getBufferSize() : 0);
    }

    try {
      // Check the start of the file for the right opening bytes
      for (int i = 0; i < 4; i++) {
        int ch = readUnsignedByte(_dstream, this);
        if (ch != sigByte[i]) {
          info.setMessage(new ErrorMessage("Document does not start with RIFF chunk", 0));
          info.setWellFormed(false);
          return 0;
        }
      }
      /* If we got this far, take note that the signature is OK. */
      info.setSigMatch(_name);

      // Get the length of the RIFF chunk.  This includes all
      // the subsequent chunks in the file, but excludes the
      // header (the four signature bytes and the length itself).
      bytesRemaining = readUnsignedInt(_dstream);

      // Read the file type.
      String typ = read4Chars(_dstream);
      bytesRemaining -= 4;
      if (!"WAVE".equals(typ)) {
        info.setMessage(new ErrorMessage("File type in RIFF header is not WAVE", _nByte));
        info.setWellFormed(false);
        return 0;
      }

      // Read chunks until the declared length is consumed or a chunk reader gives up.
      // NOTE(review): this loop relies on bytesRemaining being decreased elsewhere
      // (presumably by readChunk) -- confirm.
      while (bytesRemaining > 0) {
        if (!readChunk(info)) {
          break;
        }
      }
    } catch (EOFException e) {
      info.setWellFormed(false);
      info.setMessage(new ErrorMessage("Unexpected end of file", _nByte));
      return 0;
    }

    // Set duration from the number of samples (the rate-based computation is disabled).
    if (numSamples > 0) {
      // _aesMetadata.setDuration((double) numSamples / sampleRate);
      _aesMetadata.setDuration(numSamples);
    }

    // Add note and label properties, if there's anything
    // to report.
    if (!_labels.isEmpty()) {
      _propList.add(new Property("Labels", PropertyType.PROPERTY, PropertyArity.LIST, _labels));
    }
    if (!_labeledText.isEmpty()) {
      _propList.add(
          new Property("LabeledText", PropertyType.PROPERTY, PropertyArity.LIST, _labeledText));
    }
    if (!_notes.isEmpty()) {
      _propList.add(new Property("Notes", PropertyType.PROPERTY, PropertyArity.LIST, _notes));
    }
    if (!_samples.isEmpty()) {
      _propList.add(new Property("Samples", PropertyType.PROPERTY, PropertyArity.LIST, _samples));
    }
    if (_exifInfo != null) {
      _propList.add(_exifInfo.buildProperty());
    }
    // A Format chunk is mandatory; without one the file is not well-formed.
    if (!formatChunkSeen) {
      info.setMessage(new ErrorMessage("No Format Chunk"));
      info.setWellFormed(false);
      return 0;
    }

    /* This file looks OK. */
    if (_ckSummer != null) {
      /* We may not have actually hit the end of file. If we're calculating
       * checksums on the fly, we have to read and discard whatever is
       * left, so it will get checksummed. */
      for (; ; ) {
        try {
          int n = skipBytes(_dstream, 2048, this);
          if (n == 0) {
            break;
          }
        } catch (Exception e) {
          // Any failure while draining simply ends the drain.
          break;
        }
      }
      info.setSize(_cstream.getNBytes());
      info.setChecksum(new Checksum(_ckSummer.getCRC32(), ChecksumType.CRC32));
      // MD5 and SHA-1 are reported only when the checksummer returns a value for them.
      String value = _ckSummer.getMD5();
      if (value != null) {
        info.setChecksum(new Checksum(value, ChecksumType.MD5));
      }
      if ((value = _ckSummer.getSHA1()) != null) {
        info.setChecksum(new Checksum(value, ChecksumType.SHA1));
      }
    }

    info.setProperty(_metadata);

    // Indicate satisfied profiles.
    if (flagPCMWaveFormat) {
      info.setProfile("PCMWAVEFORMAT");
    }
    if (flagWaveFormatEx) {
      info.setProfile("WAVEFORMATEX");
    }
    if (flagWaveFormatExtensible) {
      info.setProfile("WAVEFORMATEXTENSIBLE");
    }
    if (flagBroadcastWave) {
      // Need to do some additional checks.
      // A Broadcast Wave file needs the broadcast extension chunk, and MPEG-coded
      // audio additionally needs a fact chunk.
      if (!broadcastExtChunkSeen) {
        flagBroadcastWave = false;
      }
      if (compressionCode == FormatChunk.WAVE_FORMAT_MPEG) {
        if (!broadcastExtChunkSeen || !factChunkSeen) {
          flagBroadcastWave = false;
        }
      }
      if (flagBroadcastWave) {
        String prof = null;
        switch (broadcastVersion) {
          case 0:
            prof = "Broadcast Wave Version 0";
            break;

          case 1:
            prof = "Broadcast Wave Version 1";
            break;

            // Other versions are unknown at this time
        }
        if (prof != null) {
          info.setProfile(prof);
        }
      }
    }
    return 0;
  }
Beispiel #12
0
  /**
   * Reads a Broadcast Audio Extension chunk and adds a BroadcastAudioExtension Property to the
   * WAVE module's metadata. The time reference is also stored as the start time of the module's
   * AES metadata.
   *
   * <p>The fixed part of the chunk is read in order: description (256 bytes), originator (32),
   * originator reference (32), origination date (10), origination time (8), time reference (8),
   * version (2), UMID (64), and 190 further bytes that are skipped. Anything beyond those 602
   * bytes is treated as the coding history.
   *
   * @param info not referenced by this implementation
   * @return <code>true</code>; this implementation has no path that returns <code>false</code>
   * @throws IOException if one of the underlying reads fails
   */
  public boolean readChunk(RepInfo info) throws IOException {
    WaveModule module = (WaveModule) _module;
    byte[] buf256 = new byte[256];
    ModuleBase.readByteBuf(_dstream, buf256, module);
    String description = byteBufString(buf256);
    // The same 32-byte buffer serves both the originator and the originator reference.
    byte[] buf32 = new byte[32];
    ModuleBase.readByteBuf(_dstream, buf32, module);
    String originator = byteBufString(buf32);
    ModuleBase.readByteBuf(_dstream, buf32, module);
    String originatorRef = byteBufString(buf32);
    byte[] buf10 = new byte[10];
    ModuleBase.readByteBuf(_dstream, buf10, module);
    String originationDate = byteBufString(buf10);
    byte[] buf8 = new byte[8];
    ModuleBase.readByteBuf(_dstream, buf8, module);
    String originationTime = byteBufString(buf8);
    // TimeReference is read as a 64-bit value using the module's byte order.
    long timeReference = module.readSignedLong(_dstream);
    int version = module.readUnsignedShort(_dstream);
    module.setBroadcastVersion(version);
    byte[] umid = new byte[64];
    ModuleBase.readByteBuf(_dstream, umid, module);
    module.skipBytes(_dstream, 190, module);
    // 602 = 256 + 32 + 32 + 10 + 8 + 8 + 2 + 64 + 190, the size of the fixed part read above.
    String codingHistory = "";
    if (bytesLeft > 602) {
      byte[] bufCodingHistory = new byte[(int) bytesLeft - 602];
      ModuleBase.readByteBuf(_dstream, bufCodingHistory, module);
      codingHistory = byteBufString(bufCodingHistory);
    }

    // Whew -- we've read the whole thing.  Now make that into a
    // list of Properties.  Empty strings are left out.
    List plist = new ArrayList(20);
    if (description.length() > 0) {
      plist.add(new Property("Description", PropertyType.STRING, description));
    }
    if (originator.length() > 0) {
      plist.add(new Property("Originator", PropertyType.STRING, originator));
    }
    // The originator reference used to be read and then dropped; report it like its siblings.
    if (originatorRef.length() > 0) {
      plist.add(new Property("OriginatorReference", PropertyType.STRING, originatorRef));
    }
    if (originationDate.length() > 0) {
      plist.add(new Property("OriginationDate", PropertyType.STRING, originationDate));
    }
    if (originationTime.length() > 0) {
      plist.add(new Property("OriginationTime", PropertyType.STRING, originationTime));
    }
    plist.add(new Property("TimeReference", PropertyType.LONG, Long.valueOf(timeReference)));
    plist.add(new Property("Version", PropertyType.INTEGER, Integer.valueOf(version)));
    plist.add(new Property("UMID", PropertyType.BYTE, PropertyArity.ARRAY, umid));
    if (codingHistory.length() > 0) {
      plist.add(new Property("CodingHistory", PropertyType.STRING, codingHistory));
    }

    module.addWaveProperty(
        new Property("BroadcastAudioExtension", PropertyType.PROPERTY, PropertyArity.LIST, plist));

    // set time reference in AES metadata set @author David Ackerman
    AESAudioMetadata aes = module.getAESMetadata();
    aes.setStartTime(timeReference);

    return true;
  }
Beispiel #13
0
 /**
  * Adds a Label property.
  *
  * @param p the property to append to the <code>_labels</code> list
  */
 public void addLabel(Property p) {
   _labels.add(p);
 }
  /**
   * Parse the content of a purported HTML stream digital object and store the results in RepInfo.
   *
   * <p>On the first call (<code>parseIndex</code> 0) the stream is parsed as HTML. If the document
   * turns out to be XHTML, 100 is returned so that the caller invokes <code>parse</code> again;
   * every call with a nonzero <code>parseIndex</code> is delegated to the XML module.
   *
   * @param stream An InputStream, positioned at its beginning, which is generated from the object
   *     to be parsed. If multiple calls to <code>parse</code> are made on the basis of a nonzero
   *     value being returned, a new InputStream must be provided each time.
   * @param info A fresh (on the first call) RepInfo object which will be modified to reflect the
   *     results of the parsing If multiple calls to <code>parse</code> are made on the basis of a
   *     nonzero value being returned, the same RepInfo object should be passed with each call.
   * @param parseIndex Must be 0 in first call to <code>parse</code>. If <code>parse</code> returns
   *     a nonzero value, it must be called again with <code>parseIndex</code> equal to that return
   *     value.
   * @return 0 when parsing is finished; 100 to request a second call that parses the document as
   *     XML; or, for a nonzero <code>parseIndex</code> with the XML module available, whatever
   *     that module's <code>parse</code> returns
   * @throws IOException if reading from the stream fails
   */
  public int parse(InputStream stream, RepInfo info, int parseIndex) throws IOException {
    if (parseIndex != 0) {
      // Coming in with a nonzero parseIndex indicates that we've determined
      // this is XHTML; so we invoke the XML module to parse it.
      // If parseIndex is 100, this is the first invocation of the
      // XML module, so we call it with 0; otherwise we call it with
      // the value of parseIndex.
      if (isXmlAvailable()) {
        edu.harvard.hul.ois.jhove.module.XmlModule xmlMod =
            new edu.harvard.hul.ois.jhove.module.XmlModule();
        if (parseIndex == 100) {
          parseIndex = 0;
        }
        // Hand this module's application, engine and parameters over to the XML module.
        xmlMod.setApp(_app);
        xmlMod.setBase(_je);
        xmlMod.setDefaultParams(_defaultParams);
        try {
          xmlMod.applyDefaultParams();
        } catch (Exception e) {
          // really shouldn't happen
        }
        // _doctype is whatever the first (HTML) pass recorded.
        xmlMod.setXhtmlDoctype(_doctype);
        return xmlMod.parse(stream, info, parseIndex);
      } else {
        // The XML module shouldn't be missing from any installation,
        // but someone who really wanted to could remove it.  In
        // that case, you deserve what you get.
        info.setMessage(new ErrorMessage("XML-HUL module required to validate XHTML documents"));
        info.setWellFormed(false); // Treat it as completely wrong
        return 0;
      }
    } else {
      /* parseIndex = 0, first call only */
      _doctype = null;
    }
    // Test if textMD is to be generated
    // NOTE(review): _withTextMD is only ever set to true here, never back to false -- confirm
    // that this is intended when the module is reused.
    if (_defaultParams != null) {
      Iterator iter = _defaultParams.iterator();
      while (iter.hasNext()) {
        String param = (String) iter.next();
        if (param.toLowerCase().equals("withtextmd=true")) {
          _withTextMD = true;
        }
      }
    }

    initParse();
    info.setFormat(_format[0]);
    info.setMimeType(_mimeType[0]);
    info.setModule(this);

    // parseIndex is always 0 here (nonzero values returned above), so a fresh
    // TextMDMetadata is created on every pass through this code.
    if (_textMD == null || parseIndex == 0) {
      _textMD = new TextMDMetadata();
    }
    /* We may have already done the checksums while converting a
    temporary file. */
    Checksummer ckSummer = null;
    if (_je != null && _je.getChecksumFlag() && info.getChecksum().size() == 0) {
      ckSummer = new Checksummer();
      _cstream = new ChecksumInputStream(stream, ckSummer);
      _dstream = getBufferedDataStream(_cstream, _je != null ? _je.getBufferSize() : 0);
    } else {
      _dstream = getBufferedDataStream(stream, _je != null ? _je.getBufferSize() : 0);
    }

    ParseHtml parser = null;
    HtmlMetadata metadata = null;
    HtmlCharStream cstream = null;
    try {
      // The character stream is always opened as ISO-8859-1.
      cstream = new HtmlCharStream(_dstream, "ISO-8859-1");
      parser = new ParseHtml(cstream);
    } catch (UnsupportedEncodingException e) {
      info.setMessage(new ErrorMessage("Internal error: " + e.getMessage()));
      info.setWellFormed(false);
      return 0; // shouldn't happen!
    }
    int type = 0;
    try {
      List elements = parser.HtmlDoc();
      if (elements.isEmpty()) {
        // Consider an empty document bad
        info.setWellFormed(false);
        info.setMessage(new ErrorMessage("Document is empty"));
        return 0;
      }
      // type becomes one of the version constants, 0 for "no usable DOCTYPE",
      // or a negative value when the DOCTYPE says the document is not HTML.
      type = checkDoctype(elements);
      if (type < 0) {
        info.setWellFormed(false);
        info.setMessage(new ErrorMessage("DOCTYPE is not HTML"));
        return 0;
      }
      /* Check that the first open tag is an html, head, body or title tag.
       * A plain text document
       * might be interpreted as a single PCDATA, which is in some
       * ethereal sense well-formed HTML, but it's pointless to consider
       * it such.  It might also use angle brackets as a text delimiter,
       * and that shouldn't count as HTML either. */
      // The loop stops at the first open tag whatever its name, so later tags are not examined.
      boolean hasElements = false;
      Iterator iter = elements.iterator();
      while (iter.hasNext()) {
        Object o = iter.next();
        if (o instanceof JHOpenTag) {
          String name = ((JHOpenTag) o).getName();
          if ("html".equals(name)
              || "head".equals(name)
              || "body".equals(name)
              || "title".equals(name)) {
            hasElements = true;
          }
          break;
        }
      }
      if (!hasElements) {
        info.setMessage(new ErrorMessage("Document contains no html, head, body or title tags"));
        info.setWellFormed(false);
        return 0;
      }

      // CRLF from HtmlCharStream ...
      // Record the kind of line ending the character stream reported.
      String lineEnd = cstream.getKindOfLineEnd();
      if (lineEnd == null) {
        info.setMessage(new InfoMessage("Not able to determine type of end of line"));
        _textMD.setLinebreak(TextMDMetadata.NILL);
      } else if (lineEnd.equalsIgnoreCase("CR")) {
        _textMD.setLinebreak(TextMDMetadata.LINEBREAK_CR);
      } else if (lineEnd.equalsIgnoreCase("LF")) {
        _textMD.setLinebreak(TextMDMetadata.LINEBREAK_LF);
      } else if (lineEnd.equalsIgnoreCase("CRLF")) {
        _textMD.setLinebreak(TextMDMetadata.LINEBREAK_CRLF);
      }

      if (type == 0) {
        /* If we can't find a doctype, it still might be XHTML
         * if the elements start with an XML declaration and
         * the root element is "html" */
        switch (seemsToBeXHTML(elements)) {
          case 0: // Not XML
            break; // leave the switch and continue as HTML
          case 1: // XML but not HTML
            info.setMessage(
                new ErrorMessage(
                    "Document has XML declaration but no DOCTYPE; "
                        + "probably XML rather than HTML"));
            info.setWellFormed(false);
            return 0;
          case 2: // probably XHTML
            return 100;
        }
        info.setMessage(
            new ErrorMessage(
                "Unrecognized or missing DOCTYPE declaration; "
                    + "validation continuing as HTML 3.2"));
        info.setValid(false);
        // But keep going
      }

      // Pick the document description for the detected version; anything that is not
      // listed explicitly (including type 0) is validated as HTML 3.2.
      HtmlDocDesc docDesc = null;
      switch (type) {
        case HTML_3_2:
        default:
          docDesc = new Html3_2DocDesc();
          _textMD.setMarkup_basis("HTML");
          _textMD.setMarkup_basis_version("3.2");
          break;

        case HTML_4_0_FRAMESET:
          docDesc = new Html4_0FrameDocDesc();
          _textMD.setMarkup_basis("HTML");
          _textMD.setMarkup_basis_version("4.0");
          break;
        case HTML_4_0_TRANSITIONAL:
          docDesc = new Html4_0TransDocDesc();
          _textMD.setMarkup_basis("HTML");
          _textMD.setMarkup_basis_version("4.0");
          break;
        case HTML_4_0_STRICT:
          docDesc = new Html4_0StrictDocDesc();
          _textMD.setMarkup_basis("HTML");
          _textMD.setMarkup_basis_version("4.0");
          break;
        case HTML_4_01_FRAMESET:
          docDesc = new Html4_01FrameDocDesc();
          _textMD.setMarkup_basis("HTML");
          _textMD.setMarkup_basis_version("4.01");
          break;
        case HTML_4_01_TRANSITIONAL:
          docDesc = new Html4_01TransDocDesc();
          _textMD.setMarkup_basis("HTML");
          _textMD.setMarkup_basis_version("4.01");
          break;
        case HTML_4_01_STRICT:
          docDesc = new Html4_01StrictDocDesc();
          _textMD.setMarkup_basis("HTML");
          _textMD.setMarkup_basis_version("4.01");
          break;
        case XHTML_1_0_STRICT:
        case XHTML_1_0_TRANSITIONAL:
        case XHTML_1_0_FRAMESET:
        case XHTML_1_1:
          // Force a second call to parse as XML. 100 is a
          // magic code for the first XML call.
          return 100;
      }
      _textMD.setMarkup_language(_doctype);
      // Every switch branch above either assigns docDesc or returns, so this is a safety net.
      if (docDesc == null) {
        info.setMessage(
            new InfoMessage(
                "Code for appropriate HTML version not available yet:" + "substituting HTML 3.2"));
        docDesc = new Html3_2DocDesc();
      }
      docDesc.validate(elements, info);
      metadata = docDesc.getMetadata();

      // Try to get the charset from the meta Content
      if (metadata.getCharset() != null) {
        _textMD.setCharset(metadata.getCharset());
      } else {
        _textMD.setCharset(TextMDMetadata.CHARSET_ISO8859_1);
      }
      // The two branches differ only in the character size: "variable" for charsets
      // whose name contains "UTF", "1" otherwise.
      String textMDEncoding = _textMD.getCharset();
      if (textMDEncoding.indexOf("UTF") != -1) {
        _textMD.setByte_order(
            _bigEndian ? TextMDMetadata.BYTE_ORDER_BIG : TextMDMetadata.BYTE_ORDER_LITTLE);
        _textMD.setByte_size("8");
        _textMD.setCharacter_size("variable");
      } else {
        _textMD.setByte_order(
            _bigEndian ? TextMDMetadata.BYTE_ORDER_BIG : TextMDMetadata.BYTE_ORDER_LITTLE);
        _textMD.setByte_size("8");
        _textMD.setCharacter_size("1");
      }
    } catch (ParseException e) {
      // Report where the parser stopped.
      Token t = e.currentToken;
      info.setMessage(
          new ErrorMessage("Parse error", "Line = " + t.beginLine + ", column = " + t.beginColumn));
      info.setWellFormed(false);
    } catch (TokenMgrError f) {
      info.setMessage(new ErrorMessage("TokenMgrError: " + f.getLocalizedMessage()));
      info.setWellFormed(false);
    }

    // Nothing more is reported for a document that is not well-formed.
    if (info.getWellFormed() == RepInfo.FALSE) {
      return 0;
    }

    // Profile and version names are looked up by the detected type; type 0 has none.
    if (type != 0) {
      if (profileNames[type] != null) {
        info.setProfile(profileNames[type]);
      }
      info.setVersion(versionNames[type]);
    }

    if (metadata != null) {
      // textMD is passed along only when the "withtextmd=true" parameter was seen.
      Property property = metadata.toProperty(_withTextMD ? _textMD : null);
      if (property != null) {
        info.setProperty(property);
      }
    }

    if (ckSummer != null) {
      info.setSize(_cstream.getNBytes());
      info.setChecksum(new Checksum(ckSummer.getCRC32(), ChecksumType.CRC32));
      // MD5 and SHA-1 are reported only when the checksummer returns a value for them.
      String value = ckSummer.getMD5();
      if (value != null) {
        info.setChecksum(new Checksum(value, ChecksumType.MD5));
      }
      if ((value = ckSummer.getSHA1()) != null) {
        info.setChecksum(new Checksum(value, ChecksumType.SHA1));
      }
    }

    return 0;
  }
Beispiel #15
0
 /**
  * Adds a LabeledText property.
  *
  * @param p the property to append to the <code>_labeledText</code> list
  */
 public void addLabeledText(Property p) {
   _labeledText.add(p);
 }
Beispiel #16
0
 /**
  * Adds a Property to the WAVE metadata.
  *
  * @param prop the property to append to the <code>_propList</code> list
  */
 public void addWaveProperty(Property prop) {
   _propList.add(prop);
 }
Beispiel #17
0
 /**
  * Wraps the given list in a "ListInfo" LIST property and appends that property to the WAVE
  * metadata.
  *
  * @param l the list to wrap; expected to hold String Properties
  */
 public void addListInfo(List l) {
   Property listInfo = new Property("ListInfo", PropertyType.PROPERTY, PropertyArity.LIST, l);
   _propList.add(listInfo);
 }
Beispiel #18
0
 /**
  * Adds a Note property.
  *
  * @param p the property to append to the <code>_notes</code> list
  */
 public void addNote(Property p) {
   _notes.add(p);
 }
Beispiel #19
0
 /**
  * Adds a Sample property.
  *
  * @param p the property to append to the <code>_samples</code> list
  */
 public void addSample(Property p) {
   _samples.add(p);
 }