public List<Block> chunk(String resourceId, List<TokensLine> fragments) {
  if (fragments.size() < blockSize) {
    return Collections.emptyList();
  }
  TokensLine[] fragmentsArr = fragments.toArray(new TokensLine[fragments.size()]);
  List<Block> blocks = Lists.newArrayListWithCapacity(fragmentsArr.length - blockSize + 1);
  long hash = 0;
  int first = 0;
  int last = 0;
  for (; last < blockSize - 1; last++) {
    hash = hash * PRIME_BASE + fragmentsArr[last].getHashCode();
  }
  for (; last < fragmentsArr.length; last++, first++) {
    TokensLine firstFragment = fragmentsArr[first];
    TokensLine lastFragment = fragmentsArr[last];
    // add last statement to hash
    hash = hash * PRIME_BASE + lastFragment.getHashCode();
    // create block
    Block block = new Block(
        resourceId,
        new ByteArray(hash),
        first,
        firstFragment.getStartLine(),
        lastFragment.getEndLine());
    block.setStartUnit(firstFragment.getStartUnit());
    block.setEndUnit(lastFragment.getEndUnit());
    blocks.add(block);
    // remove first statement from hash
    hash -= power * firstFragment.getHashCode();
  }
  return blocks;
}
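// The rolling-hash update above only removes the oldest statement correctly if `power` equals
// PRIME_BASE raised to (blockSize - 1); neither constant is defined in this excerpt. The sketch
// below is a minimal, standalone illustration of that invariant using assumed values
// (PRIME_BASE = 31, blockSize = 3), not the production constants: every rolled hash must equal
// a hash recomputed from scratch over the current window.
public class RollingHashSketch {
  private static final long PRIME_BASE = 31; // assumed value, not taken from the excerpt

  public static void main(String[] args) {
    int blockSize = 3;
    long power = 1;
    for (int i = 0; i < blockSize - 1; i++) {
      power *= PRIME_BASE; // power == PRIME_BASE^(blockSize - 1)
    }

    long[] items = {7, 11, 13, 17, 19}; // stand-ins for TokensLine hash codes
    long hash = 0;
    // seed with the first blockSize - 1 items, exactly like the first loop in chunk()
    for (int last = 0; last < blockSize - 1; last++) {
      hash = hash * PRIME_BASE + items[last];
    }
    for (int last = blockSize - 1, first = 0; last < items.length; last++, first++) {
      hash = hash * PRIME_BASE + items[last]; // add newest item
      long expected = 0;
      for (int k = first; k <= last; k++) { // recompute the window hash from scratch
        expected = expected * PRIME_BASE + items[k];
      }
      System.out.println(hash == expected); // prints true for every window
      hash -= power * items[first]; // drop oldest item before sliding the window
    }
  }
}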
/**
 * To simplify testing we assume that each block starts from a new line and contains
 * {@link #LINES_PER_BLOCK} lines, so we can simply use index and hash.
 */
protected static Block newBlock(String resourceId, ByteArray hash, int index) {
  return Block.builder()
      .setResourceId(resourceId)
      .setBlockHash(hash)
      .setIndexInFile(index)
      .setLines(index, index + LINES_PER_BLOCK)
      .build();
}
/**
 * @return ArrayList as we need a serializable object
 */
public ArrayList<Block> chunk(String resourceId, List<TokensLine> fragments) {
  List<TokensLine> filtered = Lists.newArrayList();
  int i = 0;
  while (i < fragments.size()) {
    TokensLine first = fragments.get(i);
    int j = i + 1;
    while (j < fragments.size() && fragments.get(j).getValue().equals(first.getValue())) {
      j++;
    }
    filtered.add(fragments.get(i));
    if (i < j - 1) {
      filtered.add(fragments.get(j - 1));
    }
    i = j;
  }
  fragments = filtered;

  if (fragments.size() < blockSize) {
    return Lists.newArrayList();
  }
  TokensLine[] fragmentsArr = fragments.toArray(new TokensLine[fragments.size()]);
  ArrayList<Block> blocks = Lists.newArrayListWithCapacity(fragmentsArr.length - blockSize + 1);
  long hash = 0;
  int first = 0;
  int last = 0;
  for (; last < blockSize - 1; last++) {
    hash = hash * PRIME_BASE + fragmentsArr[last].getHashCode();
  }
  Block.Builder blockBuilder = Block.builder().setResourceId(resourceId);
  for (; last < fragmentsArr.length; last++, first++) {
    TokensLine firstFragment = fragmentsArr[first];
    TokensLine lastFragment = fragmentsArr[last];
    // add last statement to hash
    hash = hash * PRIME_BASE + lastFragment.getHashCode();
    // create block
    Block block = blockBuilder
        .setBlockHash(new ByteArray(hash))
        .setIndexInFile(first)
        .setLines(firstFragment.getStartLine(), lastFragment.getEndLine())
        .setUnit(firstFragment.getStartUnit(), lastFragment.getEndUnit())
        .build();
    blocks.add(block);
    // remove first statement from hash
    hash -= power * firstFragment.getHashCode();
  }
  return blocks;
}
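// Before chunking, the variant above collapses each run of consecutive fragments whose
// getValue() is equal down to at most two entries: the first and the last fragment of the run.
// A minimal sketch over plain strings (hypothetical input, not TokensLine) shows the effect.
import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;

public class CollapseRunsSketch {
  static List<String> collapseRuns(List<String> values) {
    List<String> filtered = new ArrayList<>();
    int i = 0;
    while (i < values.size()) {
      String first = values.get(i);
      int j = i + 1;
      while (j < values.size() && values.get(j).equals(first)) {
        j++;
      }
      filtered.add(values.get(i)); // keep the first element of the run
      if (i < j - 1) {
        filtered.add(values.get(j - 1)); // keep the last element if the run has more than one
      }
      i = j;
    }
    return filtered;
  }

  public static void main(String[] args) {
    // A run of four identical statements keeps only its first and last occurrence.
    System.out.println(collapseRuns(Arrays.asList("A", "B", "B", "B", "B", "C")));
    // prints [A, B, B, C]
  }
}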
/** Constructs CloneGroup and saves it. */
@Override
public void endOfGroup() {
  ClonePart origin = null;
  CloneGroup.Builder builder = CloneGroup.builder().setLength(length);

  List<ClonePart> parts = Lists.newArrayListWithCapacity(count);
  for (int[] b : blockNumbers) {
    Block firstBlock = text.getBlock(b[0]);
    Block lastBlock = text.getBlock(b[1]);
    ClonePart part = new ClonePart(
        firstBlock.getResourceId(),
        firstBlock.getIndexInFile(),
        firstBlock.getStartLine(),
        lastBlock.getEndLine());

    // TODO Godin: maybe use FastStringComparator here ?
    if (originResourceId.equals(part.getResourceId())) {
      // part from origin
      if (origin == null) {
        origin = part;
        // To calculate the length it is important to use the origin, because otherwise the
        // block may come from the DB without the required data
        builder.setLengthInUnits(lastBlock.getEndUnit() - firstBlock.getStartUnit() + 1);
      } else if (part.getUnitStart() < origin.getUnitStart()) {
        origin = part;
      }
    }

    parts.add(part);
  }
  Collections.sort(parts, ContainsInComparator.CLONEPART_COMPARATOR);
  builder.setOrigin(origin).setParts(parts);

  filter(builder.build());
  reset();
}
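// Two details above are easy to miss: the length in units is computed from blocks of the origin
// resource (endUnit - startUnit + 1), since blocks of other resources may come from the DB
// without unit data, and when several parts belong to the origin resource the one with the
// smallest unit start ends up as the origin. A standalone sketch with made-up values (plain
// string arrays instead of ClonePart/Block) illustrates both rules.
import java.util.Arrays;
import java.util.List;

public class OriginSelectionSketch {
  public static void main(String[] args) {
    String originResourceId = "a";
    // Each entry is {resourceId, unitStart} for one hypothetical clone part.
    List<String[]> parts = Arrays.asList(
        new String[] {"b", "0"},
        new String[] {"a", "12"},
        new String[] {"a", "3"});

    // Among parts in the origin resource, keep the one with the smallest unit start.
    String[] origin = null;
    for (String[] part : parts) {
      if (originResourceId.equals(part[0])
          && (origin == null || Integer.parseInt(part[1]) < Integer.parseInt(origin[1]))) {
        origin = part;
      }
    }
    System.out.println(Arrays.toString(origin)); // prints [a, 3]

    // Length in units for the group, using hypothetical unit numbers of origin-resource blocks.
    int firstStartUnit = 3;
    int lastEndUnit = 10;
    System.out.println(lastEndUnit - firstStartUnit + 1); // prints 8
  }
}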
/**
 * Given a file with two lines containing the following statements:
 *
 * <pre>
 * 0: A,B,A,B
 * 1: A,B,A
 * </pre>
 *
 * with block size 5, each block will span both lines, and the hashes will be:
 *
 * <pre>
 * A,B,A,B,A=1
 * B,A,B,A,B=2
 * A,B,A,B,A=1
 * </pre>
 *
 * Expected: one clone with two parts, which contain exactly the same lines.
 */
@Test
public void same_lines_but_different_indexes() {
  CloneIndex cloneIndex = createIndex();
  Block.Builder block = Block.builder().setResourceId("a").setLines(0, 1);
  Block[] fileBlocks = new Block[] {
    block.setBlockHash(new ByteArray("1".getBytes())).setIndexInFile(0).build(),
    block.setBlockHash(new ByteArray("2".getBytes())).setIndexInFile(1).build(),
    block.setBlockHash(new ByteArray("1".getBytes())).setIndexInFile(2).build()
  };
  List<CloneGroup> clones = detect(cloneIndex, fileBlocks);
  print(clones);

  assertThat(clones.size(), is(1));
  Iterator<CloneGroup> clonesIterator = clones.iterator();
  CloneGroup clone = clonesIterator.next();
  assertThat(clone.getCloneUnitLength(), is(1));
  assertThat(clone.getCloneParts().size(), is(2));
  assertThat(clone.getOriginPart(), is(new ClonePart("a", 0, 0, 1)));
  assertThat(clone.getCloneParts(), hasItem(new ClonePart("a", 0, 0, 1)));
  assertThat(clone.getCloneParts(), hasItem(new ClonePart("a", 2, 0, 1)));
}