/** * read a < hex string >. The initial < has already been read. * * @param objNum the object number of the object containing the dictionary being read; negative * only if the object number is unavailable, which should only happen if we're reading a * string placed directly in the trailer * @param objGen the object generation of the object containing the object being read; negative * only if the objNum is unavailable * @param decrypter the decrypter to use */ private PDFObject readHexString(int objNum, int objGen, PDFDecrypter decrypter) throws IOException { // we've already read the <. Now get the hex bytes until > int val; StringBuffer sb = new StringBuffer(); while ((val = readHexPair()) >= 0) { sb.append((char) val); } if (this.buf.get() != '>') { throw new PDFParseException("Bad character in Hex String"); } return new PDFObject( this, PDFObject.STRING, decrypter.decryptString(objNum, objGen, sb.toString())); }
/** * read the cross reference table from a PDF file. When this method is called, the file pointer * must point to the start of the word "xref" in the file. Reads the xref table and the trailer * dictionary. If dictionary has a /Prev entry, move file pointer and read new trailer * * @param password */ private void readTrailer(PDFPassword password) throws IOException, PDFAuthenticationFailureException, EncryptionUnsupportedByProductException, EncryptionUnsupportedByPlatformException { // the table of xrefs this.objIdx = new PDFXref[50]; int pos = this.buf.position(); PDFDecrypter newDefaultDecrypter = null; // read a bunch of nested trailer tables while (true) { // make sure we are looking at an xref table if (!nextItemIs("xref")) { this.buf.position(pos); readTrailer15(password); return; // throw new PDFParseException("Expected 'xref' at start of table"); } // read a bunch of linked tabled while (true) { // read until the word "trailer" PDFObject obj = readObject(-1, -1, IdentityDecrypter.getInstance()); if (obj.getType() == PDFObject.KEYWORD && obj.getStringValue().equals("trailer")) { break; } // read the starting position of the reference if (obj.getType() != PDFObject.NUMBER) { throw new PDFParseException("Expected number for first xref entry"); } int refstart = obj.getIntValue(); // read the size of the reference table obj = readObject(-1, -1, IdentityDecrypter.getInstance()); if (obj.getType() != PDFObject.NUMBER) { throw new PDFParseException("Expected number for length of xref table"); } int reflen = obj.getIntValue(); // skip a line readLine(); if (refstart == 1) { // Check and try to fix incorrect Object Number Start int startPos = this.buf.position(); try { byte[] refline = new byte[20]; this.buf.get(refline); if (refline[17] == 'f') { // free PDFXref objIndex = new PDFXref(refline); if (objIndex.getID() == 0 && objIndex.getGeneration() == 65535) { // The highest generation number possible refstart--; } } } catch (Exception e) { // in case of error ignore } this.buf.position(startPos); } // extend the objIdx table, if necessary if (refstart + reflen >= this.objIdx.length) { PDFXref nobjIdx[] = new PDFXref[refstart + reflen]; System.arraycopy(this.objIdx, 0, nobjIdx, 0, this.objIdx.length); this.objIdx = nobjIdx; } // read reference lines for (int refID = refstart; refID < refstart + reflen; refID++) { // each reference line is 20 bytes long byte[] refline = new byte[20]; this.buf.get(refline); // ignore this line if the object ID is already defined if (this.objIdx[refID] != null) { continue; } // see if it's an active object if (refline[17] == 'n') { this.objIdx[refID] = new PDFXref(refline); } else { this.objIdx[refID] = new PDFXref(null); } } } // at this point, the "trailer" word (not EOL) has been read. PDFObject trailerdict = readObject(-1, -1, IdentityDecrypter.getInstance()); if (trailerdict.getType() != PDFObject.DICTIONARY) { throw new IOException("Expected dictionary after \"trailer\""); } // read the root object location if (this.root == null) { this.root = trailerdict.getDictRef("Root"); if (this.root != null) { this.root.setObjectId(PDFObject.OBJ_NUM_TRAILER, PDFObject.OBJ_NUM_TRAILER); } } // read the encryption information if (this.encrypt == null) { this.encrypt = trailerdict.getDictRef("Encrypt"); if (this.encrypt != null) { this.encrypt.setObjectId(PDFObject.OBJ_NUM_TRAILER, PDFObject.OBJ_NUM_TRAILER); } if (this.encrypt != null && !PDFDecrypterFactory.isFilterExist(this.encrypt)) { this.encrypt = null; // the filter is not located at this trailer, we will try later again } else { newDefaultDecrypter = PDFDecrypterFactory.createDecryptor( this.encrypt, trailerdict.getDictRef("ID"), password); } } if (this.info == null) { this.info = trailerdict.getDictRef("Info"); if (this.info != null) { if (!this.info.isIndirect()) { throw new PDFParseException("Info in trailer must be an indirect reference"); } this.info.setObjectId(PDFObject.OBJ_NUM_TRAILER, PDFObject.OBJ_NUM_TRAILER); } } // support for hybrid-PDFs containing an additional compressed-xref-stream PDFObject xrefstmPos = trailerdict.getDictRef("XRefStm"); if (xrefstmPos != null) { int pos14 = this.buf.position(); this.buf.position(xrefstmPos.getIntValue()); readTrailer15(password); this.buf.position(pos14); } // read the location of the previous xref table PDFObject prevloc = trailerdict.getDictRef("Prev"); if (prevloc != null) { this.buf.position(prevloc.getIntValue()); } else { break; } // see if we have an optional Version entry if (this.root.getDictRef("Version") != null) { processVersion(this.root.getDictRef("Version").getStringValue()); } } // make sure we found a root if (this.root == null) { throw new PDFParseException("No /Root key found in trailer dictionary"); } if (this.encrypt != null && newDefaultDecrypter != null) { PDFObject permissions = this.encrypt.getDictRef("P"); if (permissions != null && !newDefaultDecrypter.isOwnerAuthorised()) { int perms = permissions != null ? permissions.getIntValue() : 0; if (permissions != null) { this.printable = (perms & 4) != 0; this.saveable = (perms & 16) != 0; } } // Install the new default decrypter only after the trailer has // been read, as nothing we're reading passing through is encrypted this.defaultDecrypter = newDefaultDecrypter; } // dereference the root object this.root.dereference(); }
/** * read a ( character string ). The initial ( has already been read. Read until a *balanced* ) * appears. * * <p>Section 3.2.3 of PDF Refernce version 1.7 defines the format of String objects. Regarding * literal strings: * * <blockquote> * * Within a literal string, the backslash (\) is used as an escape character for various purposes, * such as to include newline characters, nonprinting ASCII characters, unbalanced parentheses, or * the backslash character itself in the string. The character immediately following the backslash * determines its precise interpretation (see Table 3.2). If the character following the backslash * is not one of those shown in the table, the backslash is ignored. * * </blockquote> * * * * * <p>This only reads 8 bit basic character 'strings' so as to avoid a text string interpretation * when one is not desired (e.g., for byte strings, as used by the decryption mechanism). For an * interpretation of a string returned from this method, where the object type is defined as a * 'text string' as per Section 3.8.1, Table 3.31 "PDF Data Types", {@link * PDFStringUtil#asTextString} ()} or {@link PDFObject#getTextStringValue()} must be employed. * * @param objNum the object number of the object containing the dictionary being read; negative * only if the object number is unavailable, which should only happen if we're reading a * dictionary placed directly in the trailer * @param objGen the object generation of the object containing the object being read; negative * only if the objNum is unavailable * @param decrypter the decrypter to use */ private PDFObject readLiteralString(int objNum, int objGen, PDFDecrypter decrypter) throws IOException { int c; // we've already read the (. now get the characters until a // *balanced* ) appears. Translate \r \n \t \b \f \( \) \\ \ddd // if a cr/lf follows a backslash, ignore the cr/lf int parencount = 1; StringBuffer sb = new StringBuffer(); while (parencount > 0) { c = this.buf.get() & 0xFF; // process unescaped parenthesis if (c == '(') { parencount++; } else if (c == ')') { parencount--; if (parencount == 0) { c = -1; break; } } else if (c == '\\') { // From the spec: // Within a literal string, the backslash (\) is used as an // escape character for various purposes, such as to include // newline characters, nonprinting ASCII characters, // unbalanced parentheses, or the backslash character itself // in the string. The character immediately following the // backslash determines its precise interpretation (see // Table 3.2). If the character following the backslash is not // one of those shown in the table, the backslash is ignored. // // summary of rules: // // \n \r \t \b \f 2-char sequences are used to represent their // 1-char counterparts // // \( and \) are used to escape parenthesis // // \\ for a literal backslash // // \ddd (1-3 octal digits) for a character code // // \<EOL> is used to put formatting newlines into the // file, but aren't actually part of the string; EOL may be // CR, LF or CRLF // // any other sequence should see the backslash ignored // grab the next character to see what we're dealing with c = this.buf.get() & 0xFF; if (c >= '0' && c < '8') { // \ddd form - one to three OCTAL digits int count = 0; int val = 0; while (c >= '0' && c < '8' && count < 3) { val = val * 8 + c - '0'; c = this.buf.get() & 0xFF; count++; } // we'll have read one character too many this.buf.position(this.buf.position() - 1); c = val; } else if (c == 'n') { c = '\n'; } else if (c == 'r') { c = '\r'; } else if (c == 't') { c = '\t'; } else if (c == 'b') { c = '\b'; } else if (c == 'f') { c = '\f'; } else if (c == '\r') { // escaped CR to be ignored; look for a following LF c = this.buf.get() & 0xFF; if (c != '\n') { // not an LF, we'll consume this character on // the next iteration this.buf.position(this.buf.position() - 1); } c = -1; } else if (c == '\n') { // escaped LF to be ignored c = -1; } // any other c should be used as is, as it's either // one of ()\ in which case it should be used literally, // or the backslash should just be ignored } if (c >= 0) { sb.append((char) c); } } return new PDFObject( this, PDFObject.STRING, decrypter.decryptString(objNum, objGen, sb.toString())); }
/** * read the cross reference table from a PDF file. When this method is called, the file pointer * must point to the start of the word "xref" in the file. Reads the xref table and the trailer * dictionary. If dictionary has a /Prev entry, move file pointer and read new trailer * * @param password */ private void readTrailer15(PDFPassword password) throws IOException, PDFAuthenticationFailureException, EncryptionUnsupportedByProductException, EncryptionUnsupportedByPlatformException { // the table of xrefs // objIdx is initialized from readTrailer(), do not overwrite here data from hybrid PDFs // objIdx = new PDFXref[50]; PDFDecrypter newDefaultDecrypter = null; while (true) { PDFObject xrefObj = readObject(-1, -1, IdentityDecrypter.getInstance()); PDFObject pdfObject = xrefObj.getDictionary().get("W"); if (pdfObject == null) { break; } PDFObject[] wNums = pdfObject.getArray(); int l1 = wNums[0].getIntValue(); int l2 = wNums[1].getIntValue(); int l3 = wNums[2].getIntValue(); int size = xrefObj.getDictionary().get("Size").getIntValue(); byte[] strmbuf = xrefObj.getStream(); int strmPos = 0; PDFObject idxNums = xrefObj.getDictionary().get("Index"); int[] idxArray; if (idxNums == null) { idxArray = new int[] {0, size}; } else { PDFObject[] idxNumArr = idxNums.getArray(); idxArray = new int[idxNumArr.length]; for (int i = 0; i < idxNumArr.length; i++) { idxArray[i] = idxNumArr[i].getIntValue(); } } int idxLen = idxArray.length; int idxPos = 0; while (idxPos < idxLen) { int refstart = idxArray[idxPos++]; int reflen = idxArray[idxPos++]; // extend the objIdx table, if necessary if (refstart + reflen >= this.objIdx.length) { PDFXref nobjIdx[] = new PDFXref[refstart + reflen]; System.arraycopy(this.objIdx, 0, nobjIdx, 0, this.objIdx.length); this.objIdx = nobjIdx; } // read reference lines for (int refID = refstart; refID < refstart + reflen; refID++) { int type = readNum(strmbuf, strmPos, l1); strmPos += l1; int id = readNum(strmbuf, strmPos, l2); strmPos += l2; int gen = readNum(strmbuf, strmPos, l3); strmPos += l3; // ignore this line if the object ID is already defined if (this.objIdx[refID] != null) { continue; } // see if it's an active object if (type == 0) { // inactive this.objIdx[refID] = new PDFXref(null); } else if (type == 1) { // active uncompressed this.objIdx[refID] = new PDFXref(id, gen); } else { // active compressed this.objIdx[refID] = new PDFXref(id, gen, true); } } } HashMap<String, PDFObject> trailerdict = xrefObj.getDictionary(); // read the root object location if (this.root == null) { this.root = trailerdict.get("Root"); if (this.root != null) { this.root.setObjectId(PDFObject.OBJ_NUM_TRAILER, PDFObject.OBJ_NUM_TRAILER); } } // read the encryption information if (this.encrypt == null) { this.encrypt = trailerdict.get("Encrypt"); if (this.encrypt != null) { this.encrypt.setObjectId(PDFObject.OBJ_NUM_TRAILER, PDFObject.OBJ_NUM_TRAILER); } if (this.encrypt != null && !PDFDecrypterFactory.isFilterExist(this.encrypt)) { this.encrypt = null; // the filter is not located at this trailer, we will try later again } else { newDefaultDecrypter = PDFDecrypterFactory.createDecryptor(this.encrypt, trailerdict.get("ID"), password); } } if (this.info == null) { this.info = trailerdict.get("Info"); if (this.info != null) { if (!this.info.isIndirect()) { throw new PDFParseException("Info in trailer must be an indirect reference"); } this.info.setObjectId(PDFObject.OBJ_NUM_TRAILER, PDFObject.OBJ_NUM_TRAILER); } } // read the location of the previous xref table PDFObject prevloc = trailerdict.get("Prev"); if (prevloc != null) { this.buf.position(prevloc.getIntValue()); } else { break; } // see if we have an optional Version entry if (this.root.getDictRef("Version") != null) { processVersion(this.root.getDictRef("Version").getStringValue()); } } // make sure we found a root if (this.root == null) { throw new PDFParseException("No /Root key found in trailer dictionary"); } // check what permissions are relevant if (this.encrypt != null && newDefaultDecrypter != null) { PDFObject permissions = this.encrypt.getDictRef("P"); if (permissions != null && !newDefaultDecrypter.isOwnerAuthorised()) { int perms = permissions != null ? permissions.getIntValue() : 0; if (permissions != null) { this.printable = (perms & 4) != 0; this.saveable = (perms & 16) != 0; } } // Install the new default decrypter only after the trailer has // been read, as nothing we're reading passing through is encrypted this.defaultDecrypter = newDefaultDecrypter; } // dereference the root object this.root.dereference(); }