/**
 * Builds a text unit whose source holds one segment with the content " Text ",
 * runs the segmentation step with the DEEPEN_EXISTING strategy, and checks the
 * layout of the source parts before and after the step handles the text unit.
 */
@Test
public void testWhiteSpacesInSingleSegment() throws URISyntaxException {
    ITextUnit textUnit = new TextUnit("tu1");
    TextContainer source = textUnit.getSource();
    source.append(new Segment("seg1", new TextFragment(" Text ")));

    // Before the step: the only part is the segment, spaces included.
    assertEquals(1, source.getSegments().count());
    assertEquals(1, source.count());
    assertEquals(" Text ", source.get(0).toString());
    assertTrue(source.get(0).isSegment());

    params.setSegmentationStrategy(SegmStrategy.DEEPEN_EXISTING);
    Event textUnitEvent = new Event(EventType.TEXT_UNIT, textUnit);
    segStep.handleTextUnit(textUnitEvent);

    // After the step: still one segment, now framed by two non-segment parts.
    assertEquals(1, source.getSegments().count());
    assertEquals(3, source.count());

    // Expected parts, in index order: text and whether the part is a segment.
    String[] expectedTexts = {" ", "Text", " "};
    boolean[] expectedSegmentFlags = {false, true, false};
    for (int i = 0; i < expectedTexts.length; i++) {
        assertEquals(expectedTexts[i], source.get(i).toString());
        if (expectedSegmentFlags[i]) {
            assertTrue(source.get(i).isSegment());
        } else {
            assertFalse(source.get(i).isSegment());
        }
    }
}
protected static long count( Class<? extends BaseCounter> classRef, Object text, LocaleId language) { if (text == null) return 0L; if (Util.isNullOrEmpty(language)) return 0L; if (text instanceof ITextUnit) { ITextUnit tu = (ITextUnit) text; // if (tu.hasTarget(language)) // return count(classRef, tu.getTarget(language), language); // else // Only words in the source are counted return count(classRef, tu.getSource(), language); } else if (text instanceof Segment) { Segment seg = (Segment) text; return count(classRef, seg.getContent(), language); } else if (text instanceof TextContainer) { // This work on segments' content (vs. parts' content) TextContainer tc = (TextContainer) text; long res = 0; for (Segment seg : tc.getSegments()) { res += count(classRef, seg, language); } return res; } else if (text instanceof TextFragment) { TextFragment tf = (TextFragment) text; return count(classRef, TextUnitUtil.getText(tf), language); } else if (text instanceof String) { instantiateCounter(classRef); if (counter == null) return 0L; return counter.doCount((String) text, language); } return 0; }
@Test public void testSegmentationStrategy() { ITextUnit tu1 = new TextUnit("tu1"); TextContainer source = tu1.getSource(); source.append(new Segment("seg1", new TextFragment("Sentence1. Sentence2."))); source.append(new TextPart(" Text part 1. ")); source.append(new Segment("seg1", new TextFragment("Sentence3."))); assertEquals(2, source.getSegments().count()); assertEquals("Sentence1. Sentence2.", source.get(0).toString()); assertTrue(source.get(0).isSegment()); assertEquals(" Text part 1. ", source.get(1).toString()); assertFalse(source.get(1).isSegment()); assertEquals("Sentence3.", source.get(2).toString()); assertTrue(source.get(2).isSegment()); // 1 params.setSegmentationStrategy(SegmStrategy.KEEP_EXISTING); segStep.handleTextUnit(new Event(EventType.TEXT_UNIT, tu1)); // Check if not changed assertEquals(2, source.getSegments().count()); assertEquals(3, source.count()); assertEquals("Sentence1. Sentence2.", source.get(0).toString()); assertTrue(source.get(0).isSegment()); assertEquals(" Text part 1. ", source.get(1).toString()); assertFalse(source.get(1).isSegment()); assertEquals("Sentence3.", source.get(2).toString()); assertTrue(source.get(2).isSegment()); // 2 params.setSegmentationStrategy(SegmStrategy.DEEPEN_EXISTING); segStep.handleTextUnit(new Event(EventType.TEXT_UNIT, tu1)); // Check if did change assertEquals(3, source.getSegments().count()); assertEquals(5, source.count()); assertEquals("Sentence1.", source.get(0).toString()); assertTrue(source.get(0).isSegment()); assertEquals(" ", source.get(1).toString()); assertFalse(source.get(1).isSegment()); assertEquals("Sentence2.", source.get(2).toString()); assertTrue(source.get(2).isSegment()); assertEquals(" Text part 1. 
", source.get(3).toString()); assertFalse(source.get(3).isSegment()); assertEquals("Sentence3.", source.get(4).toString()); assertTrue(source.get(4).isSegment()); // 3 params.setSegmentationStrategy(SegmStrategy.OVERWRITE_EXISTING); segStep.handleTextUnit(new Event(EventType.TEXT_UNIT, tu1)); // Check if did change assertEquals(4, source.getSegments().count()); assertEquals(7, source.count()); assertEquals("Sentence1.", source.get(0).toString()); assertTrue(source.get(0).isSegment()); assertEquals(" ", source.get(1).toString()); assertFalse(source.get(1).isSegment()); assertEquals("Sentence2.", source.get(2).toString()); assertTrue(source.get(2).isSegment()); assertEquals(" ", source.get(3).toString()); assertFalse(source.get(3).isSegment()); assertEquals("Text part 1.", source.get(4).toString()); assertTrue(source.get(4).isSegment()); assertEquals(" ", source.get(5).toString()); assertFalse(source.get(5).isSegment()); assertEquals("Sentence3.", source.get(6).toString()); assertTrue(source.get(6).isSegment()); }