/**
 * Exercises the three "meets minimum mergeable sequence" predicates of {@link
 * SharedVertexSequenceSplitter} on a diamond-shaped graph: one top vertex with an edge to every
 * middle vertex, and every middle vertex with an edge to one bottom vertex.
 *
 * @param mids sequences used to build the middle vertices
 * @param minSeqLength threshold handed to each predicate
 * @param prefixMeets expected answer of the prefix predicate
 * @param suffixMeets expected answer of the suffix predicate
 */
@Test(dataProvider = "MeetsMinSequenceData")
public void testSplitterCompleteCycle(
    final List<String> mids,
    final int minSeqLength,
    final boolean prefixMeets,
    final boolean suffixMeets) {
  final SeqGraph graph = new SeqGraph(11);
  final SeqVertex top = new SeqVertex("AAAAAAAA");
  final SeqVertex bot = new SeqVertex("GGGGGGGG");

  // One vertex per supplied middle sequence, in the order given.
  final List<SeqVertex> middles = new ArrayList<>();
  for (final String sequence : mids) {
    middles.add(new SeqVertex(sequence));
  }

  // Middles are registered before the two end points, as the edges below need all of them.
  graph.addVertices(middles.toArray(new SeqVertex[0]));
  graph.addVertices(top, bot);

  // Wire up the diamond: top -> middle -> bot for every middle vertex.
  for (final SeqVertex middle : middles) {
    graph.addEdge(top, middle);
    graph.addEdge(middle, bot);
  }

  final SharedVertexSequenceSplitter splitter = new SharedVertexSequenceSplitter(graph, middles);

  Assert.assertEquals(
      splitter.meetsMinMergableSequenceForPrefix(minSeqLength), prefixMeets, "Prefix failed");
  Assert.assertEquals(
      splitter.meetsMinMergableSequenceForSuffix(minSeqLength), suffixMeets, "Suffix failed");
  // The combined predicate is expected to be the logical OR of the two individual ones.
  Assert.assertEquals(
      splitter.meetsMinMergableSequenceForEitherPrefixOrSuffix(minSeqLength),
      suffixMeets || prefixMeets,
      "Either prefix or suffix failed");
}
/**
 * Splits a set of unconnected vertices and checks the resulting prefix/suffix vertices and the
 * shape of the split graph.
 *
 * <p>The expected prefix and suffix are taken from the first string: its leading {@code
 * expectedPrefixLen} characters and its trailing {@code expectedSuffixLen} characters.
 *
 * @param strings sequences used to build the vertices handed to the splitter
 * @param expectedPrefixLen length of the prefix the splitter is expected to extract
 * @param expectedSuffixLen length of the suffix the splitter is expected to extract
 */
@Test(dataProvider = "PrefixSuffixData")
public void testSplitter(
    final List<String> strings, int expectedPrefixLen, int expectedSuffixLen) {
  final SeqGraph graph = new SeqGraph(11);

  final List<SeqVertex> vertices = new ArrayList<>();
  for (final String sequence : strings) {
    vertices.add(new SeqVertex(sequence));
  }
  graph.addVertices(vertices.toArray(new SeqVertex[0]));

  final String firstSequence = strings.get(0);
  final String expectedPrefix = firstSequence.substring(0, expectedPrefixLen);
  final String expectedSuffix =
      firstSequence.substring(firstSequence.length() - expectedSuffixLen);

  final SharedVertexSequenceSplitter splitter =
      new SharedVertexSequenceSplitter(graph, vertices);
  splitter.split();

  // The extracted sequences must match what the first input string predicts.
  Assert.assertEquals(splitter.prefixV.getSequenceString(), expectedPrefix);
  Assert.assertEquals(splitter.suffixV.getSequenceString(), expectedSuffix);

  // The prefix vertex has no incoming edges and at most one outgoing edge per input string;
  // the suffix vertex mirrors that with no outgoing edges.
  final int inputCount = strings.size();
  Assert.assertTrue(splitter.splitGraph.outDegreeOf(splitter.prefixV) <= inputCount);
  Assert.assertEquals(splitter.splitGraph.inDegreeOf(splitter.prefixV), 0);
  Assert.assertTrue(splitter.splitGraph.inDegreeOf(splitter.suffixV) <= inputCount);
  Assert.assertEquals(splitter.splitGraph.outDegreeOf(splitter.suffixV), 0);

  // Every new middle vertex must sit between the prefix and the suffix.
  for (final SeqVertex middle : splitter.newMiddles) {
    Assert.assertNotNull(splitter.splitGraph.getEdge(splitter.prefixV, middle));
    Assert.assertNotNull(splitter.splitGraph.getEdge(middle, splitter.suffixV));
  }
}
/** * Can we safely split up the vertices in toMerge? * * @param graph a graph * @param bot a vertex whose incoming vertices we want to split * @param toMerge the set of vertices we'd be splitting up * @return true if we can safely split up toMerge */ private boolean safeToSplit( final SeqGraph graph, final SeqVertex bot, final Collection<SeqVertex> toMerge) { final Set<SeqVertex> outgoingOfBot = new HashSet<SeqVertex>(graph.outgoingVerticesOf(bot)); for (final SeqVertex m : toMerge) { final Set<BaseEdge> outs = graph.outgoingEdgesOf(m); if (m == bot || outs.size() != 1 || !graph.outgoingVerticesOf(m).contains(bot)) // m == bot => don't allow self cycles in the graph return false; if (outgoingOfBot.contains(m)) // forbid cycles from bottom -> mid return false; } return true; }
// recursively initialize nodeDests from start node private void initNodeDests(SeqPhase phase, SeqNode n) throws Xcept { if (nodeDests.get(n) != null) return; if (stopAtFFTs && n.fft() != null) return; LinkedHashSet<SeqNode> dests = new LinkedHashSet<SeqNode>(); nodeDests.put(n, dests); Iterator<SeqEdge> es = graph.getEdgesFrom(n); while (es.hasNext()) { SeqEdge e = es.next(); SeqNode n1 = e.dest(); dests.add(n1); if (phase != graph.getPhase(n1)) continue; if (stopAtFFTs && n1.fft() != null) continue; initNodeDests(phase, n1); } }
/**
 * Would factoring out this suffix result in eliminating the reference source vertex?
 *
 * <p>The answer is determined by the first vertex in {@code toSplits} that the graph reports as
 * a reference source: the result is true exactly when that vertex has the same length as
 * {@code commonSuffix}. Only lengths are compared here, not the bases themselves.
 *
 * @param graph the graph
 * @param commonSuffix the common suffix of all toSplits
 * @param toSplits the list of vertices we are trying to split
 * @return true if toSplits contains a reference source whose length equals that of
 *     commonSuffix; false if it does not, or if toSplits holds no reference source at all
 */
private boolean wouldEliminateRefSource(
    final SeqGraph graph, final SeqVertex commonSuffix, final Collection<SeqVertex> toSplits) {
  for (final SeqVertex candidate : toSplits) {
    if (!graph.isRefSource(candidate)) {
      continue;
    }
    // First reference source found decides the outcome.
    final boolean entirelySuffix = candidate.length() == commonSuffix.length();
    return entirelySuffix;
  }
  return false;
}
// constructor public SeqGraphXcons( SeqGraph graph, LinkedHashSet<SeqNode> startNodes, LinkedHashSet<SeqNode> stopNodes, boolean stopAtFFTs) throws Xcept { this.graph = graph; this.startNodes = startNodes; this.stopNodes = stopNodes; this.stopAtFFTs = stopAtFFTs; System.err.println("SeqGraphXCons stopAtFFTs=" + stopAtFFTs); nodeDests = new Hashtable<SeqNode, LinkedHashSet<SeqNode>>(); Iterator<SeqNode> ns = startNodes.iterator(); while (ns.hasNext()) { SeqNode n = ns.next(); SeqPhase p = graph.getPhase(n); initNodeDests(p, n); } System.err.println(" nodeDests=" + nodeDests); }
/** * Simple single-function interface to split and then update a graph * * @param graph the graph containing the vertices in toMerge * @param v The bottom node whose incoming vertices we'd like to split * @return true if some useful splitting was done, false otherwise */ public boolean split(final SeqGraph graph, final SeqVertex v) { if (graph == null) throw new IllegalArgumentException("graph cannot be null"); if (v == null) throw new IllegalArgumentException("v cannot be null"); if (!graph.vertexSet().contains(v)) throw new IllegalArgumentException("graph doesn't contain vertex v " + v); final Collection<SeqVertex> toSplit = graph.incomingVerticesOf(v); if (toSplit.size() < 2) // Can only split at least 2 vertices return false; else if (!safeToSplit(graph, v, toSplit)) { return false; } else { final SeqVertex suffixVTemplate = commonSuffix(toSplit); if (suffixVTemplate.isEmpty()) { return false; } else if (wouldEliminateRefSource(graph, suffixVTemplate, toSplit)) { return false; } else if (allVerticesAreTheCommonSuffix(suffixVTemplate, toSplit)) { return false; } else { final List<BaseEdge> edgesToRemove = new LinkedList<BaseEdge>(); // graph.printGraph(new File("split.pre_" + v.getSequenceString() + "." 
+ // counter + ".dot"), 0); for (final SeqVertex mid : toSplit) { // create my own copy of the suffix final SeqVertex suffixV = new SeqVertex(suffixVTemplate.getSequence()); graph.addVertex(suffixV); final SeqVertex prefixV = mid.withoutSuffix(suffixV.getSequence()); final BaseEdge out = graph.outgoingEdgeOf(mid); final SeqVertex incomingTarget; if (prefixV == null) { // this node is entirely explained by suffix incomingTarget = suffixV; } else { incomingTarget = prefixV; graph.addVertex(prefixV); graph.addEdge(prefixV, suffixV, new BaseEdge(out.isRef(), 1)); edgesToRemove.add(out); } graph.addEdge(suffixV, graph.getEdgeTarget(out), out.copy()); for (final BaseEdge in : graph.incomingEdgesOf(mid)) { graph.addEdge(graph.getEdgeSource(in), incomingTarget, in.copy()); edgesToRemove.add(in); } } graph.removeAllVertices(toSplit); graph.removeAllEdges(edgesToRemove); // graph.printGraph(new File("split.post_" + v.getSequenceString() + "." + // counter++ + ".dot"), 0); return true; } } }
/**
 * End-to-end check that splitting and then updating a graph does not change the haplotypes it
 * encodes.
 *
 * <p>The haplotypes found by {@link KBestHaplotypeFinder} on a clone of the graph before the
 * split are compared with those found on the graph after {@code split()} and {@code
 * updateGraph(top, bot)}: every haplotype found afterwards must have been present before, and
 * the sorted unique haplotypes of both runs must be equal.
 *
 * @param strings sequences used to build the middle vertices
 * @param hasTop whether a shared top vertex with an edge to every middle vertex is added
 * @param hasBot whether a shared bottom vertex with an edge from every middle vertex is added
 */
@Test(dataProvider = "CompleteCycleData")
public void testSplitterCompleteCycle(
    final List<String> strings, final boolean hasTop, final boolean hasBot) {
  // Shared by the optional graph dumps and by the final assertion message.
  final String label = Utils.join("_", strings) + "_" + hasTop + "_" + hasBot;

  final SeqGraph graph = new SeqGraph(11);
  final SeqVertex top = hasTop ? new SeqVertex("AAAAAAAA") : null;
  final SeqVertex bot = hasBot ? new SeqVertex("GGGGGGGG") : null;

  final List<SeqVertex> middles = new ArrayList<>();
  for (final String sequence : strings) {
    middles.add(new SeqVertex(sequence));
  }
  graph.addVertices(middles.toArray(new SeqVertex[0]));

  // Edges touching the first middle vertex get true as the first BaseEdge argument; every edge
  // gets its own increasing weight.
  final SeqVertex first = middles.get(0);
  int edgeWeight = 1;
  if (hasTop) {
    graph.addVertex(top);
    for (final SeqVertex middle : middles) {
      graph.addEdge(top, middle, new BaseEdge(middle == first, edgeWeight++));
    }
  }
  if (hasBot) {
    graph.addVertex(bot);
    for (final SeqVertex middle : middles) {
      graph.addEdge(middle, bot, new BaseEdge(middle == first, edgeWeight++));
    }
  }

  // Haplotypes of the untouched graph; the finder works on a clone.
  final KBestHaplotypeFinder originalPaths =
      new KBestHaplotypeFinder((SeqGraph) graph.clone(), graph.getSources(), graph.getSinks());
  final Set<String> haplotypes = new HashSet<>();
  for (final KBestHaplotype path : originalPaths) {
    haplotypes.add(new String(path.bases()));
  }

  final SharedVertexSequenceSplitter splitter = new SharedVertexSequenceSplitter(graph, middles);
  splitter.split();
  if (PRINT_GRAPHS) {
    graph.printGraph(new File(label + ".original.dot"), 0);
    splitter.splitGraph.printGraph(new File(label + ".split.dot"), 0);
  }

  splitter.updateGraph(top, bot);
  if (PRINT_GRAPHS) {
    graph.printGraph(new File(label + ".updated.dot"), 0);
  }

  // No haplotype may appear after the update that was not there before.
  final KBestHaplotypeFinder splitPaths =
      new KBestHaplotypeFinder(graph, graph.getSources(), graph.getSinks());
  for (final KBestHaplotype path : splitPaths) {
    final String haplotype = new String(path.bases());
    Assert.assertTrue(haplotypes.contains(haplotype), "Failed to find haplotype " + haplotype);
  }

  // And the unique haplotypes must agree exactly once both sides are sorted the same way.
  final List<byte[]> sortedOriginalPaths = new ArrayList<>(originalPaths.size());
  for (final KBestHaplotype kbh : originalPaths.unique()) {
    sortedOriginalPaths.add(kbh.bases());
  }
  Collections.sort(sortedOriginalPaths, BaseUtils.BASES_COMPARATOR);

  final List<byte[]> sortedSplitPaths = new ArrayList<>(splitPaths.size());
  for (final KBestHaplotype kbh : splitPaths.unique()) {
    sortedSplitPaths.add(kbh.bases());
  }
  Collections.sort(sortedSplitPaths, BaseUtils.BASES_COMPARATOR);

  Assert.assertEquals(sortedSplitPaths, sortedOriginalPaths, label);
}