/** * Set all parameters in one 'row' * * <p>WARNIGN: If we don't do it this way, we get strange errors due to array resizing (array * appears to be filled with zeros after being set) */ void set(int idx, int leftIdx, int rightIdx, int midPos, TIntArrayList intersecting) { // Assign values left[idx] = leftIdx; right[idx] = rightIdx; mid[idx] = midPos; // Assign intersecting values if (intersecting.isEmpty()) { intersectFilePosStart[idx] = intersectFilePosEnd[idx] = null; } else { // First try to collapse intersecting all intervals as a single file block if (intersecting.size() > 0 && consecutiveFileBlock(intersecting)) { // OK, we can collapse all entries into a file block intersectFilePosStart[idx] = new long[1]; intersectFilePosEnd[idx] = new long[1]; intersectFilePosStart[idx][0] = consecutiveFileBlockMin(intersecting); intersectFilePosEnd[idx][0] = consecutiveFileBlockMax(intersecting); } else { // Add entries individually intersectFilePosStart[idx] = new long[intersecting.size()]; intersectFilePosEnd[idx] = new long[intersecting.size()]; for (int i = 0; i < intersecting.size(); i++) { int j = intersecting.get(i); intersectFilePosStart[idx][i] = vcfIndexChromo.getFilePosStart(j); intersectFilePosEnd[idx][i] = vcfIndexChromo.getFilePosEnd(j); } } } }
/** Builds the tree over every entry of vcfIndexChromo (indexes 0..size-1) and flags it in sync. */
@Override
public void build() {
  int total = vcfIndexChromo.size();

  // Collect the index of every entry, in order
  TIntArrayList all = new TIntArrayList(total);
  for (int entry = 0; entry < total; entry++) {
    all.add(entry);
  }

  build(all);
  inSync = true;
}
/**
 * Span covered in the file by the entries indexed by 'idxs': largest file end position minus
 * smallest file start position.
 *
 * <p>For an empty list the initial sentinels are used unchanged, i.e. the result is {@code -1 -
 * Long.MAX_VALUE}.
 */
long consecutiveFileBlockSize(TIntArrayList idxs) {
  long lowest = Long.MAX_VALUE;
  long highest = -1;

  int count = idxs.size();
  for (int k = 0; k < count; k++) {
    int entry = idxs.get(k);
    long entryStart = vcfIndexChromo.getFilePosStart(entry);
    long entryEnd = vcfIndexChromo.getFilePosEnd(entry);
    if (entryStart < lowest) {
      lowest = entryStart;
    }
    if (entryEnd > highest) {
      highest = entryEnd;
    }
  }

  return highest - lowest;
}
/** * Index entries in VcfIndexDataChromo * * @return Index of added item (-1 if no item was added) */ int build(TIntArrayList idxs) { if (idxs.isEmpty()) return -1; // Find middle position // Note:If we mode the 'mid' point by one base, the probability of intersecting // an interval is significantly reduced (most entries are SNPs). This reduces // the index size, the number of 'file.seek()' operations and speeds up the index. int center = mean(idxs); int firstStart = vcfIndexChromo.getStart(idxs.get(0)); if (center > firstStart) center--; // Index of entry to be added int idx = nextEntry(); // Split indexes into left, right and intersecting TIntArrayList left = new TIntArrayList(); TIntArrayList right = new TIntArrayList(); TIntArrayList intersecting = new TIntArrayList(); // Try to collapse consecutive entries if there are only a few (i.e. less // than COLLAPSE_MAX_NUM_ENTRIES) or the block size is small (less // than COLLAPSE_MAX_BLOCK_SIZE bytes) if (consecutiveFileBlock(idxs) && // ((idxs.size() < COLLAPSE_MAX_NUM_ENTRIES) || (consecutiveFileBlockSize(idxs) < maxBlockSize)) // ) { // Too few intervals forming a consecutive block? // Just add them to the intersect for (int i = 0; i < idxs.size(); i++) { int j = idxs.get(i); intersecting.add(j); } } else { // Add indexes into left, right and intersecting for (int i = 0; i < idxs.size(); i++) { int j = idxs.get(i); if (vcfIndexChromo.getEnd(j) < center) left.add(j); else if (vcfIndexChromo.getStart(j) > center) right.add(j); else intersecting.add(j); } } // Recurse int leftIdx = build(left); int rightIdx = build(right); // Create this entry set(idx, leftIdx, rightIdx, center, intersecting); return idx; }
/**
 * Middle coordinate of the entries indexed by 'idxs'.
 *
 * <p>Despite the name, this is not an arithmetic mean: the start and end of every entry are
 * collected, sorted, and the element at position {@code size / 2} of the sorted list is returned.
 *
 * @return the middle coordinate, or 0 if 'idxs' is empty
 */
int mean(TIntArrayList idxs) {
  if (idxs.isEmpty()) {
    return 0;
  }

  int count = idxs.size();
  TIntArrayList coords = new TIntArrayList(2 * count);
  for (int k = 0; k < count; k++) {
    int entry = idxs.get(k);
    coords.add(vcfIndexChromo.getStart(entry));
    coords.add(vcfIndexChromo.getEnd(entry));
  }

  coords.sort();
  int middle = coords.size() / 2;
  return coords.get(middle);
}
/** Are entries indexed by 'idxs' consecutive position in the file? */ boolean consecutiveFileBlock(TIntArrayList idxs) { long end = -1; for (int i = 0; i < idxs.size(); i++) { int idx = idxs.get(i); if (end < 0) end = vcfIndexChromo.getFilePosEnd(idx); long start = vcfIndexChromo.getFilePosStart(idx); if ((start - end) > 0) return false; // Prepare for next iteration end = vcfIndexChromo.getFilePosEnd(idx); } return true; }
/**
 * Largest file end position among the entries indexed by 'idxs'.
 *
 * @return the maximum end position, or -1 if 'idxs' is empty
 */
long consecutiveFileBlockMax(TIntArrayList idxs) {
  long highest = -1;

  int count = idxs.size();
  for (int k = 0; k < count; k++) {
    long entryEnd = vcfIndexChromo.getFilePosEnd(idxs.get(k));
    if (entryEnd > highest) {
      highest = entryEnd;
    }
  }

  return highest;
}
/**
 * Creates an empty tree (size 0) backed by 'vcfIndexChromo', with all per-node arrays allocated
 * at INITIAL_CAPACITY.
 *
 * @param vcf not used by this constructor body
 * @param vcfIndexChromo entries to be indexed; may be null, in which case 'chromosome' is null
 */
@SuppressWarnings("unchecked")
public VcfIndexTree(VcfFileIterator vcf, VcfIndexDataChromo vcfIndexChromo) {
  this.vcfIndexChromo = vcfIndexChromo;
  if (vcfIndexChromo == null) {
    chromosome = null;
  } else {
    chromosome = vcfIndexChromo.getChromosome();
  }

  // Node structure: children and mid point
  left = new int[INITIAL_CAPACITY];
  right = new int[INITIAL_CAPACITY];
  mid = new int[INITIAL_CAPACITY];

  // Per-node intersecting data
  intersectFilePosStart = new long[INITIAL_CAPACITY][];
  intersectFilePosEnd = new long[INITIAL_CAPACITY][];
  intersect = new List[INITIAL_CAPACITY]; // generic array creation: source of the unchecked warning

  size = 0;
}
/** Find all interval indexes from intervals within [startIdx, endIdx] that intersect 'pos' */ int[] intersectIndexes(int startIdx, int endIdx, int pos) { List<Integer> list = null; // Find all intersecting intervals for (int idx = startIdx; idx <= endIdx; idx++) { if (vcfIndexChromo.intersects(idx, pos)) { // Add this position if (list == null) list = new ArrayList<>(); list.add(idx); } } // No results if (list == null) return new int[0]; // Create an array int i = 0; int ints[] = new int[list.size()]; for (int idx : list) ints[i++] = idx; return ints; }
/**
 * Is this index empty?
 *
 * @return true if there is no backing vcfIndexChromo (the constructor accepts null) or if it
 *     reports a size of zero or less
 */
@Override
public boolean isEmpty() {
  // The constructor tolerates a null vcfIndexChromo; treat that as empty instead of throwing
  return vcfIndexChromo == null || vcfIndexChromo.size() <= 0;
}