/**
 * Creates an indexed view over the fasta file at {@code path}. The companion index is read from
 * the same location with a {@code .fai} suffix appended.
 *
 * @param path location of the fasta file
 * @throws IOException declared for failures while determining the content length or loading the
 *     index
 */
public FastaIndexedSequence(String path) throws IOException {
  this.path = path;
  this.contentLength = ParsingUtils.getContentLength(path);

  // No "index older than the fasta" timestamp check is made here on purpose: modification times
  // are meaningless once files have been copied or moved, which is always the case for hosted
  // genomes, so such a check only produces spurious warnings.
  final String faiLocation = path + ".fai";
  this.index = new FastaIndex(faiLocation);

  this.chromoNamesList = new ArrayList<String>(this.index.getSequenceNames());
}
/** * Return the sequence for the query interval as a byte array. Coordinates are "ucsc" style (0 * based) * * <p>Example: 5 bases per line, 6 bytes per line * * <p>Bases 0 1 2 3 4 * | 5 6 7 8 9 * | 10 11 12 13 14 * etc Offset 0 1 2 3 4 0 1 2 3 4 0 1 2 3 4 * Bytes 0 1 2 3 4 5 | 6 7 8 9 10 | 11 12 13 14 15 16 * * <p>query 9 - 13 start line = 1 base0 = 1*5 = 5 offset = (9 - 5) = 4 start byte = (1*6) + 3 = 10 * end line = 2 * * @param chr * @param qstart * @param qend * @return */ public byte[] getSequence(String chr, int qstart, int qend) { FastaIndex.FastaSequenceIndexEntry idxEntry = index.getIndexEntry(chr); if (idxEntry == null) { return null; } try { final int start = Math.max(0, qstart); // qstart should never be < 0 final int end = Math.min((int) idxEntry.getSize(), qend); final int bytesPerLine = idxEntry.getBytesPerLine(); final int basesPerLine = idxEntry.getBasesPerLine(); int nEndBytes = bytesPerLine - basesPerLine; int startLine = start / basesPerLine; int endLine = end / basesPerLine; int base0 = startLine * basesPerLine; // Base at beginning of start line int offset = start - base0; final long position = idxEntry.getPosition(); long startByte = position + startLine * bytesPerLine + offset; int base1 = endLine * basesPerLine; int offset1 = end - base1; long endByte = Math.min(contentLength, position + endLine * bytesPerLine + offset1); if (startByte >= endByte) { return null; } // Read all the bytes in the range. This will include endline characters byte[] allBytes = readBytes(startByte, endByte); // Create the array for the sequence -- this will be "allBytes" without the endline // characters. 
ByteArrayOutputStream bos = new ByteArrayOutputStream(end - start); int srcPos = 0; int desPos = 0; // Copy first line final int allBytesLength = allBytes.length; if (offset > 0) { int nBases = Math.min(end - start, basesPerLine - offset); bos.write(allBytes, srcPos, nBases); srcPos += (nBases + nEndBytes); desPos += nBases; } while (srcPos < allBytesLength) { int nBases = Math.min(basesPerLine, allBytesLength - srcPos); bos.write(allBytes, srcPos, nBases); srcPos += (nBases + nEndBytes); desPos += nBases; } return bos.toByteArray(); } catch (IOException e) { e .printStackTrace(); // To change body of catch statement use File | Settings | File // Templates. return null; } }
/**
 * Looks up the length of a sequence in the fasta index.
 *
 * @param chrname name of the sequence to look up
 * @return the value reported by the index's {@code getSequenceSize} for {@code chrname}
 */
@Override
public int getChromosomeLength(String chrname) {
  final int indexedSize = index.getSequenceSize(chrname);
  return indexedSize;
}