/** * read a < hex string >. The initial < has already been read. * * @param objNum the object number of the object containing the dictionary being read; negative * only if the object number is unavailable, which should only happen if we're reading a * string placed directly in the trailer * @param objGen the object generation of the object containing the object being read; negative * only if the objNum is unavailable * @param decrypter the decrypter to use */ private PDFObject readHexString(int objNum, int objGen, PDFDecrypter decrypter) throws IOException { // we've already read the <. Now get the hex bytes until > int val; StringBuffer sb = new StringBuffer(); while ((val = readHexPair()) >= 0) { sb.append((char) val); } if (this.buf.get() != '>') { throw new PDFParseException("Bad character in Hex String"); } return new PDFObject( this, PDFObject.STRING, decrypter.decryptString(objNum, objGen, sb.toString())); }
/** * read a ( character string ). The initial ( has already been read. Read until a *balanced* ) * appears. * * <p>Section 3.2.3 of PDF Refernce version 1.7 defines the format of String objects. Regarding * literal strings: * * <blockquote> * * Within a literal string, the backslash (\) is used as an escape character for various purposes, * such as to include newline characters, nonprinting ASCII characters, unbalanced parentheses, or * the backslash character itself in the string. The character immediately following the backslash * determines its precise interpretation (see Table 3.2). If the character following the backslash * is not one of those shown in the table, the backslash is ignored. * * </blockquote> * * * * * <p>This only reads 8 bit basic character 'strings' so as to avoid a text string interpretation * when one is not desired (e.g., for byte strings, as used by the decryption mechanism). For an * interpretation of a string returned from this method, where the object type is defined as a * 'text string' as per Section 3.8.1, Table 3.31 "PDF Data Types", {@link * PDFStringUtil#asTextString} ()} or {@link PDFObject#getTextStringValue()} must be employed. * * @param objNum the object number of the object containing the dictionary being read; negative * only if the object number is unavailable, which should only happen if we're reading a * dictionary placed directly in the trailer * @param objGen the object generation of the object containing the object being read; negative * only if the objNum is unavailable * @param decrypter the decrypter to use */ private PDFObject readLiteralString(int objNum, int objGen, PDFDecrypter decrypter) throws IOException { int c; // we've already read the (. now get the characters until a // *balanced* ) appears. Translate \r \n \t \b \f \( \) \\ \ddd // if a cr/lf follows a backslash, ignore the cr/lf int parencount = 1; StringBuffer sb = new StringBuffer(); while (parencount > 0) { c = this.buf.get() & 0xFF; // process unescaped parenthesis if (c == '(') { parencount++; } else if (c == ')') { parencount--; if (parencount == 0) { c = -1; break; } } else if (c == '\\') { // From the spec: // Within a literal string, the backslash (\) is used as an // escape character for various purposes, such as to include // newline characters, nonprinting ASCII characters, // unbalanced parentheses, or the backslash character itself // in the string. The character immediately following the // backslash determines its precise interpretation (see // Table 3.2). If the character following the backslash is not // one of those shown in the table, the backslash is ignored. // // summary of rules: // // \n \r \t \b \f 2-char sequences are used to represent their // 1-char counterparts // // \( and \) are used to escape parenthesis // // \\ for a literal backslash // // \ddd (1-3 octal digits) for a character code // // \<EOL> is used to put formatting newlines into the // file, but aren't actually part of the string; EOL may be // CR, LF or CRLF // // any other sequence should see the backslash ignored // grab the next character to see what we're dealing with c = this.buf.get() & 0xFF; if (c >= '0' && c < '8') { // \ddd form - one to three OCTAL digits int count = 0; int val = 0; while (c >= '0' && c < '8' && count < 3) { val = val * 8 + c - '0'; c = this.buf.get() & 0xFF; count++; } // we'll have read one character too many this.buf.position(this.buf.position() - 1); c = val; } else if (c == 'n') { c = '\n'; } else if (c == 'r') { c = '\r'; } else if (c == 't') { c = '\t'; } else if (c == 'b') { c = '\b'; } else if (c == 'f') { c = '\f'; } else if (c == '\r') { // escaped CR to be ignored; look for a following LF c = this.buf.get() & 0xFF; if (c != '\n') { // not an LF, we'll consume this character on // the next iteration this.buf.position(this.buf.position() - 1); } c = -1; } else if (c == '\n') { // escaped LF to be ignored c = -1; } // any other c should be used as is, as it's either // one of ()\ in which case it should be used literally, // or the backslash should just be ignored } if (c >= 0) { sb.append((char) c); } } return new PDFObject( this, PDFObject.STRING, decrypter.decryptString(objNum, objGen, sb.toString())); }