/** * Returns a hex enccoded SHA1 hash of the whole file. This can be used to locate the files bytes * again * * @param in * @param hashStore * @param blobStore * @return * @throws IOException */ public String parse(InputStream in, HashStore hashStore, BlobStore blobStore) throws IOException { if (log.isInfoEnabled()) { log.info("parse. inputstream: " + in); } Rsum rsum = new Rsum(128); int numBlobs = 0; byte[] arr = new byte[1024]; ByteArrayOutputStream bout = new ByteArrayOutputStream(); List<String> blobHashes = new ArrayList<>(); MessageDigest blobCrc = getCrypt(); MessageDigest fanoutCrc = getCrypt(); MessageDigest fileCrc = getCrypt(); long fanoutLength = 0; long fileLength = 0; int s = in.read(arr, 0, 1024); if (log.isTraceEnabled()) { log.trace("initial block size: " + s); } List<String> fanoutHashes = new ArrayList<>(); while (s >= 0) { numBytes += s; // log.trace("numBytes: {}", numBytes); if (cancelled) { throw new IOException("operation cancelled"); } for (int i = 0; i < s; i++) { byte b = arr[i]; rsum.roll(b); blobCrc.update(b); fanoutCrc.update(b); fileCrc.update(b); fanoutLength++; fileLength++; bout.write(b); int x = rsum.getValue(); // System.out.println("x=" + x); // System.out.println("check mask: " + (x & MASK) + " == " + MASK); boolean limited; if (MAX_BLOB_SIZE != null) { limited = bout.size() > MAX_BLOB_SIZE; if (limited) { log.warn("HIT BLOB LIMIT: " + bout.size()); } } else { limited = false; } if (((x & MASK) == MASK) || limited) { String blobCrcHex = toHex(blobCrc); byte[] blobBytes = bout.toByteArray(); if (log.isInfoEnabled()) { log.info( "Store blob: " + blobCrcHex + " length=" + blobBytes.length + " hash: " + x + " mask: " + MASK); } blobStore.setBlob(blobCrcHex, blobBytes); bout.reset(); blobHashes.add(blobCrcHex); blobCrc.reset(); if ((x & FANOUT_MASK) == FANOUT_MASK) { String fanoutCrcVal = toHex(fanoutCrc); fanoutHashes.add(fanoutCrcVal); // log.info("set chunk fanout: {} length={}", fanoutCrcVal, fanoutLength); hashStore.setChunkFanout(fanoutCrcVal, blobHashes, fanoutLength); fanoutLength = 0; fanoutCrc.reset(); blobHashes = new ArrayList<>(); } numBlobs++; rsum.reset(); } } s = in.read(arr, 0, 1024); } // Need to store terminal data, ie data which has been accumulated since the last boundary String blobCrcHex = toHex(blobCrc); // System.out.println("Store terminal blob: " + blobCrcHex); blobStore.setBlob(blobCrcHex, bout.toByteArray()); numBlobs++; blobHashes.add(blobCrcHex); String fanoutCrcVal = toHex(fanoutCrc); // log.info("set terminal chunk fanout: {} length={}" ,fanoutCrcVal, fanoutLength); hashStore.setChunkFanout(fanoutCrcVal, blobHashes, fanoutLength); fanoutHashes.add(fanoutCrcVal); // Now store a fanout for the whole file. The contained hashes locate other fanouts String fileCrcVal = toHex(fileCrc); if (log.isInfoEnabled()) { log.info( "set file fanout: " + fanoutCrcVal + " length=" + fileLength + " avg blob size=" + fileLength / numBlobs); } hashStore.setFileFanout(fileCrcVal, fanoutHashes, fileLength); return fileCrcVal; }